@@ -252,62 +252,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
252252; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
253253; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
254254; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
255- ; GCN-NEXT: v_mov_b32_e32 v8 , 0
255+ ; GCN-NEXT: v_mov_b32_e32 v44 , 0
256256; GCN-NEXT: s_waitcnt lgkmcnt(0)
257- ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
258- ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
259- ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
260- ; GCN-NEXT: v_accvgpr_write_b32 a31, s23
261- ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
262- ; GCN-NEXT: v_accvgpr_write_b32 a30, s22
263- ; GCN-NEXT: v_accvgpr_write_b32 a29, s21
264- ; GCN-NEXT: v_accvgpr_write_b32 a28, s20
265- ; GCN-NEXT: v_accvgpr_write_b32 a27, s19
266- ; GCN-NEXT: v_accvgpr_write_b32 a26, s18
267- ; GCN-NEXT: v_accvgpr_write_b32 a25, s17
268- ; GCN-NEXT: v_accvgpr_write_b32 a24, s16
269- ; GCN-NEXT: v_accvgpr_write_b32 a23, s15
270- ; GCN-NEXT: v_accvgpr_write_b32 a22, s14
271- ; GCN-NEXT: v_accvgpr_write_b32 a21, s13
272- ; GCN-NEXT: v_accvgpr_write_b32 a20, s12
273- ; GCN-NEXT: v_accvgpr_write_b32 a19, s11
274- ; GCN-NEXT: v_accvgpr_write_b32 a18, s10
275- ; GCN-NEXT: v_accvgpr_write_b32 a17, s9
276- ; GCN-NEXT: v_accvgpr_write_b32 a16, s8
277- ; GCN-NEXT: v_mov_b32_e32 v10, s20
278- ; GCN-NEXT: v_mov_b32_e32 v11, s21
279- ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31]
280- ; GCN-NEXT: v_mov_b32_e32 v12, s22
281- ; GCN-NEXT: v_mov_b32_e32 v13, s23
282- ; GCN-NEXT: v_mov_b32_e32 v0, s16
283- ; GCN-NEXT: v_mov_b32_e32 v1, s17
284- ; GCN-NEXT: v_mov_b32_e32 v2, s18
285- ; GCN-NEXT: v_mov_b32_e32 v3, s19
286- ; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
287- ; GCN-NEXT: s_waitcnt vmcnt(0)
288- ; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
257+ ; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
258+ ; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
259+ ; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
260+ ; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
261+ ; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
262+ ; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
263+ ; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
264+ ; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
265+ ; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
266+ ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
267+ ; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
268+ ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
269+ ; GCN-NEXT: v_mov_b32_e32 v40, s20
270+ ; GCN-NEXT: v_mov_b32_e32 v41, s21
271+ ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
272+ ; GCN-NEXT: v_mov_b32_e32 v42, s22
273+ ; GCN-NEXT: v_mov_b32_e32 v43, s23
274+ ; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
275+ ; GCN-NEXT: s_waitcnt vmcnt(0)
276+ ; GCN-NEXT: s_nop 2
277+ ; GCN-NEXT: v_mov_b32_e32 v16, s16
278+ ; GCN-NEXT: v_mov_b32_e32 v17, s17
279+ ; GCN-NEXT: v_mov_b32_e32 v18, s18
280+ ; GCN-NEXT: v_mov_b32_e32 v19, s19
281+ ; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
289282; GCN-NEXT: s_waitcnt vmcnt(0)
290283; GCN-NEXT: s_nop 0
291- ; GCN-NEXT: v_mov_b32_e32 v0 , s12
292- ; GCN-NEXT: v_mov_b32_e32 v1 , s13
293- ; GCN-NEXT: v_mov_b32_e32 v2 , s14
294- ; GCN-NEXT: v_mov_b32_e32 v3 , s15
295- ; GCN-NEXT: global_store_dwordx4 v8 , v[0:3 ], s[0:1] offset:16 sc0 sc1
284+ ; GCN-NEXT: v_mov_b32_e32 v16 , s12
285+ ; GCN-NEXT: v_mov_b32_e32 v17 , s13
286+ ; GCN-NEXT: v_mov_b32_e32 v18 , s14
287+ ; GCN-NEXT: v_mov_b32_e32 v19 , s15
288+ ; GCN-NEXT: global_store_dwordx4 v44 , v[16:19 ], s[0:1] offset:16 sc0 sc1
296289; GCN-NEXT: s_waitcnt vmcnt(0)
297290; GCN-NEXT: s_nop 0
298- ; GCN-NEXT: v_mov_b32_e32 v0 , s8
299- ; GCN-NEXT: v_mov_b32_e32 v1 , s9
300- ; GCN-NEXT: v_mov_b32_e32 v2 , s10
301- ; GCN-NEXT: v_mov_b32_e32 v3 , s11
302- ; GCN-NEXT: global_store_dwordx4 v8 , v[0:3 ], s[0:1] sc0 sc1
291+ ; GCN-NEXT: v_mov_b32_e32 v16 , s8
292+ ; GCN-NEXT: v_mov_b32_e32 v17 , s9
293+ ; GCN-NEXT: v_mov_b32_e32 v18 , s10
294+ ; GCN-NEXT: v_mov_b32_e32 v19 , s11
295+ ; GCN-NEXT: global_store_dwordx4 v44 , v[16:19 ], s[0:1] sc0 sc1
303296; GCN-NEXT: s_waitcnt vmcnt(0)
304- ; GCN-NEXT: global_store_dwordx4 v8, a [8:11], s[0:1] offset:32 sc0 sc1
297+ ; GCN-NEXT: global_store_dwordx4 v44, v [8:11], s[0:1] offset:32 sc0 sc1
305298; GCN-NEXT: s_waitcnt vmcnt(0)
306- ; GCN-NEXT: global_store_dwordx4 v8, a [12:15], s[0:1] offset:48 sc0 sc1
299+ ; GCN-NEXT: global_store_dwordx4 v44, v [12:15], s[0:1] offset:48 sc0 sc1
307300; GCN-NEXT: s_waitcnt vmcnt(0)
308- ; GCN-NEXT: global_store_dwordx4 v8, a [0:3], s[0:1] sc0 sc1
301+ ; GCN-NEXT: global_store_dwordx4 v44, v [0:3], s[0:1] sc0 sc1
309302; GCN-NEXT: s_waitcnt vmcnt(0)
310- ; GCN-NEXT: global_store_dwordx4 v8, a [4:7], s[0:1] offset:16 sc0 sc1
303+ ; GCN-NEXT: global_store_dwordx4 v44, v [4:7], s[0:1] offset:16 sc0 sc1
311304; GCN-NEXT: s_waitcnt vmcnt(0)
312305; GCN-NEXT: s_endpgm
313306 %result = call <16 x float > @llvm.amdgcn.mfma.f32.32x32x16.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <16 x float > %arg2 , i32 0 , i32 0 , i32 0 )
@@ -322,62 +315,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
322315; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
323316; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
324317; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
325- ; GCN-NEXT: v_mov_b32_e32 v8 , 0
318+ ; GCN-NEXT: v_mov_b32_e32 v44 , 0
326319; GCN-NEXT: s_waitcnt lgkmcnt(0)
327- ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
328- ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
329- ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
330- ; GCN-NEXT: v_accvgpr_write_b32 a31, s23
331- ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
332- ; GCN-NEXT: v_accvgpr_write_b32 a30, s22
333- ; GCN-NEXT: v_accvgpr_write_b32 a29, s21
334- ; GCN-NEXT: v_accvgpr_write_b32 a28, s20
335- ; GCN-NEXT: v_accvgpr_write_b32 a27, s19
336- ; GCN-NEXT: v_accvgpr_write_b32 a26, s18
337- ; GCN-NEXT: v_accvgpr_write_b32 a25, s17
338- ; GCN-NEXT: v_accvgpr_write_b32 a24, s16
339- ; GCN-NEXT: v_accvgpr_write_b32 a23, s15
340- ; GCN-NEXT: v_accvgpr_write_b32 a22, s14
341- ; GCN-NEXT: v_accvgpr_write_b32 a21, s13
342- ; GCN-NEXT: v_accvgpr_write_b32 a20, s12
343- ; GCN-NEXT: v_accvgpr_write_b32 a19, s11
344- ; GCN-NEXT: v_accvgpr_write_b32 a18, s10
345- ; GCN-NEXT: v_accvgpr_write_b32 a17, s9
346- ; GCN-NEXT: v_accvgpr_write_b32 a16, s8
347- ; GCN-NEXT: v_mov_b32_e32 v10, s20
348- ; GCN-NEXT: v_mov_b32_e32 v11, s21
349- ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
350- ; GCN-NEXT: v_mov_b32_e32 v12, s22
351- ; GCN-NEXT: v_mov_b32_e32 v13, s23
352- ; GCN-NEXT: v_mov_b32_e32 v0, s16
353- ; GCN-NEXT: v_mov_b32_e32 v1, s17
354- ; GCN-NEXT: v_mov_b32_e32 v2, s18
355- ; GCN-NEXT: v_mov_b32_e32 v3, s19
356- ; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
357- ; GCN-NEXT: s_waitcnt vmcnt(0)
358- ; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
320+ ; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
321+ ; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
322+ ; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
323+ ; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
324+ ; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
325+ ; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
326+ ; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
327+ ; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
328+ ; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
329+ ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
330+ ; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
331+ ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
332+ ; GCN-NEXT: v_mov_b32_e32 v40, s20
333+ ; GCN-NEXT: v_mov_b32_e32 v41, s21
334+ ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
335+ ; GCN-NEXT: v_mov_b32_e32 v42, s22
336+ ; GCN-NEXT: v_mov_b32_e32 v43, s23
337+ ; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
338+ ; GCN-NEXT: s_waitcnt vmcnt(0)
339+ ; GCN-NEXT: s_nop 2
340+ ; GCN-NEXT: v_mov_b32_e32 v16, s16
341+ ; GCN-NEXT: v_mov_b32_e32 v17, s17
342+ ; GCN-NEXT: v_mov_b32_e32 v18, s18
343+ ; GCN-NEXT: v_mov_b32_e32 v19, s19
344+ ; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
359345; GCN-NEXT: s_waitcnt vmcnt(0)
360346; GCN-NEXT: s_nop 0
361- ; GCN-NEXT: v_mov_b32_e32 v0 , s12
362- ; GCN-NEXT: v_mov_b32_e32 v1 , s13
363- ; GCN-NEXT: v_mov_b32_e32 v2 , s14
364- ; GCN-NEXT: v_mov_b32_e32 v3 , s15
365- ; GCN-NEXT: global_store_dwordx4 v8 , v[0:3 ], s[0:1] offset:16 sc0 sc1
347+ ; GCN-NEXT: v_mov_b32_e32 v16 , s12
348+ ; GCN-NEXT: v_mov_b32_e32 v17 , s13
349+ ; GCN-NEXT: v_mov_b32_e32 v18 , s14
350+ ; GCN-NEXT: v_mov_b32_e32 v19 , s15
351+ ; GCN-NEXT: global_store_dwordx4 v44 , v[16:19 ], s[0:1] offset:16 sc0 sc1
366352; GCN-NEXT: s_waitcnt vmcnt(0)
367353; GCN-NEXT: s_nop 0
368- ; GCN-NEXT: v_mov_b32_e32 v0 , s8
369- ; GCN-NEXT: v_mov_b32_e32 v1 , s9
370- ; GCN-NEXT: v_mov_b32_e32 v2 , s10
371- ; GCN-NEXT: v_mov_b32_e32 v3 , s11
372- ; GCN-NEXT: global_store_dwordx4 v8 , v[0:3 ], s[0:1] sc0 sc1
354+ ; GCN-NEXT: v_mov_b32_e32 v16 , s8
355+ ; GCN-NEXT: v_mov_b32_e32 v17 , s9
356+ ; GCN-NEXT: v_mov_b32_e32 v18 , s10
357+ ; GCN-NEXT: v_mov_b32_e32 v19 , s11
358+ ; GCN-NEXT: global_store_dwordx4 v44 , v[16:19 ], s[0:1] sc0 sc1
373359; GCN-NEXT: s_waitcnt vmcnt(0)
374- ; GCN-NEXT: global_store_dwordx4 v8, a [8:11], s[0:1] offset:32 sc0 sc1
360+ ; GCN-NEXT: global_store_dwordx4 v44, v [8:11], s[0:1] offset:32 sc0 sc1
375361; GCN-NEXT: s_waitcnt vmcnt(0)
376- ; GCN-NEXT: global_store_dwordx4 v8, a [12:15], s[0:1] offset:48 sc0 sc1
362+ ; GCN-NEXT: global_store_dwordx4 v44, v [12:15], s[0:1] offset:48 sc0 sc1
377363; GCN-NEXT: s_waitcnt vmcnt(0)
378- ; GCN-NEXT: global_store_dwordx4 v8, a [0:3], s[0:1] sc0 sc1
364+ ; GCN-NEXT: global_store_dwordx4 v44, v [0:3], s[0:1] sc0 sc1
379365; GCN-NEXT: s_waitcnt vmcnt(0)
380- ; GCN-NEXT: global_store_dwordx4 v8, a [4:7], s[0:1] offset:16 sc0 sc1
366+ ; GCN-NEXT: global_store_dwordx4 v44, v [4:7], s[0:1] offset:16 sc0 sc1
381367; GCN-NEXT: s_waitcnt vmcnt(0)
382368; GCN-NEXT: s_endpgm
383369 %result = call <16 x float > @llvm.amdgcn.mfma.f32.32x32x16.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <16 x float > %arg2 , i32 1 , i32 2 , i32 3 )
@@ -393,35 +379,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
393379; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
394380; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
395381; GCN-NEXT: s_waitcnt lgkmcnt(0)
396- ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
397- ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
398- ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
399- ; GCN-NEXT: v_accvgpr_write_b32 a0, s8
400- ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
401- ; GCN-NEXT: v_accvgpr_write_b32 a1, s9
402- ; GCN-NEXT: v_accvgpr_write_b32 a2, s10
403- ; GCN-NEXT: v_accvgpr_write_b32 a3, s11
404- ; GCN-NEXT: v_accvgpr_write_b32 a4, s12
405- ; GCN-NEXT: v_accvgpr_write_b32 a5, s13
406- ; GCN-NEXT: v_accvgpr_write_b32 a6, s14
407- ; GCN-NEXT: v_accvgpr_write_b32 a7, s15
408- ; GCN-NEXT: v_accvgpr_write_b32 a8, s16
409- ; GCN-NEXT: v_accvgpr_write_b32 a9, s17
410- ; GCN-NEXT: v_accvgpr_write_b32 a10, s18
411- ; GCN-NEXT: v_accvgpr_write_b32 a11, s19
412- ; GCN-NEXT: v_accvgpr_write_b32 a12, s20
413- ; GCN-NEXT: v_accvgpr_write_b32 a13, s21
414- ; GCN-NEXT: v_accvgpr_write_b32 a14, s22
415- ; GCN-NEXT: v_accvgpr_write_b32 a15, s23
382+ ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
383+ ; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
384+ ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
385+ ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
386+ ; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
387+ ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
388+ ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
389+ ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
390+ ; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
391+ ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
392+ ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
393+ ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
416394; GCN-NEXT: s_nop 1
417- ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a [0:15], v[0:3 ], v[4:7 ], a [0:15]
418- ; GCN-NEXT: v_mov_b32_e32 v0 , 0
395+ ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v [0:15], v[16:19 ], v[20:23 ], v [0:15]
396+ ; GCN-NEXT: v_mov_b32_e32 v16 , 0
419397; GCN-NEXT: s_nop 7
420398; GCN-NEXT: s_nop 2
421- ; GCN-NEXT: global_store_dwordx4 v0, a [12:15], s[0:1] offset:48
422- ; GCN-NEXT: global_store_dwordx4 v0, a [8:11], s[0:1] offset:32
423- ; GCN-NEXT: global_store_dwordx4 v0, a [4:7], s[0:1] offset:16
424- ; GCN-NEXT: global_store_dwordx4 v0, a [0:3], s[0:1]
399+ ; GCN-NEXT: global_store_dwordx4 v16, v [12:15], s[0:1] offset:48
400+ ; GCN-NEXT: global_store_dwordx4 v16, v [8:11], s[0:1] offset:32
401+ ; GCN-NEXT: global_store_dwordx4 v16, v [4:7], s[0:1] offset:16
402+ ; GCN-NEXT: global_store_dwordx4 v16, v [0:3], s[0:1]
425403; GCN-NEXT: s_endpgm
426404 %result = call <16 x float > @llvm.amdgcn.mfma.f32.32x32x16.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <16 x float > %arg2 , i32 0 , i32 0 , i32 0 )
427405 store <16 x float > %result , ptr addrspace (1 ) %out
@@ -435,40 +413,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
435413; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
436414; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
437415; GCN-NEXT: s_waitcnt lgkmcnt(0)
438- ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
439- ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
440- ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
441- ; GCN-NEXT: v_accvgpr_write_b32 a0, s8
442- ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
443- ; GCN-NEXT: v_accvgpr_write_b32 a1, s9
444- ; GCN-NEXT: v_accvgpr_write_b32 a2, s10
445- ; GCN-NEXT: v_accvgpr_write_b32 a3, s11
446- ; GCN-NEXT: v_accvgpr_write_b32 a4, s12
447- ; GCN-NEXT: v_accvgpr_write_b32 a5, s13
448- ; GCN-NEXT: v_accvgpr_write_b32 a6, s14
449- ; GCN-NEXT: v_accvgpr_write_b32 a7, s15
450- ; GCN-NEXT: v_accvgpr_write_b32 a8, s16
451- ; GCN-NEXT: v_accvgpr_write_b32 a9, s17
452- ; GCN-NEXT: v_accvgpr_write_b32 a10, s18
453- ; GCN-NEXT: v_accvgpr_write_b32 a11, s19
454- ; GCN-NEXT: v_accvgpr_write_b32 a12, s20
455- ; GCN-NEXT: v_accvgpr_write_b32 a13, s21
456- ; GCN-NEXT: v_accvgpr_write_b32 a14, s22
457- ; GCN-NEXT: v_accvgpr_write_b32 a15, s23
416+ ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
417+ ; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
418+ ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
419+ ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
420+ ; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
421+ ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
422+ ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
423+ ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
424+ ; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
425+ ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
426+ ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
427+ ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
458428; GCN-NEXT: s_nop 1
459- ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a [0:15], v[0:3 ], v[4:7 ], a [0:15] cbsz:3 abid:2 blgp:1
460- ; GCN-NEXT: v_mov_b32_e32 v0 , 0
429+ ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v [0:15], v[16:19 ], v[20:23 ], v [0:15] cbsz:3 abid:2 blgp:1
430+ ; GCN-NEXT: v_mov_b32_e32 v16 , 0
461431; GCN-NEXT: s_nop 7
462432; GCN-NEXT: s_nop 2
463- ; GCN-NEXT: global_store_dwordx4 v0, a [12:15], s[0:1] offset:48
464- ; GCN-NEXT: global_store_dwordx4 v0, a [8:11], s[0:1] offset:32
465- ; GCN-NEXT: global_store_dwordx4 v0, a [4:7], s[0:1] offset:16
466- ; GCN-NEXT: global_store_dwordx4 v0, a [0:3], s[0:1]
433+ ; GCN-NEXT: global_store_dwordx4 v16, v [12:15], s[0:1] offset:48
434+ ; GCN-NEXT: global_store_dwordx4 v16, v [8:11], s[0:1] offset:32
435+ ; GCN-NEXT: global_store_dwordx4 v16, v [4:7], s[0:1] offset:16
436+ ; GCN-NEXT: global_store_dwordx4 v16, v [0:3], s[0:1]
467437; GCN-NEXT: s_endpgm
468438 %result = call <16 x float > @llvm.amdgcn.mfma.f32.32x32x16.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <16 x float > %arg2 , i32 3 , i32 2 , i32 1 )
469439 store <16 x float > %result , ptr addrspace (1 ) %out
470440 ret void
471441}
472442
473- attributes #0 = { "amdgpu-flat-work-group-size" ="512,512" }
443+ attributes #0 = { "amdgpu-flat-work-group-size" ="512,512" "amdgpu-agpr-alloc" = "0,0" }
474444attributes #1 = { "amdgpu-flat-work-group-size" ="1,64" }
0 commit comments