@@ -252,62 +252,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
252
252
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
253
253
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
254
254
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
255
- ; GCN-NEXT: v_mov_b32_e32 v8 , 0
255
+ ; GCN-NEXT: v_mov_b32_e32 v44 , 0
256
256
; GCN-NEXT: s_waitcnt lgkmcnt(0)
257
- ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
258
- ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
259
- ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
260
- ; GCN-NEXT: v_accvgpr_write_b32 a31, s23
261
- ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
262
- ; GCN-NEXT: v_accvgpr_write_b32 a30, s22
263
- ; GCN-NEXT: v_accvgpr_write_b32 a29, s21
264
- ; GCN-NEXT: v_accvgpr_write_b32 a28, s20
265
- ; GCN-NEXT: v_accvgpr_write_b32 a27, s19
266
- ; GCN-NEXT: v_accvgpr_write_b32 a26, s18
267
- ; GCN-NEXT: v_accvgpr_write_b32 a25, s17
268
- ; GCN-NEXT: v_accvgpr_write_b32 a24, s16
269
- ; GCN-NEXT: v_accvgpr_write_b32 a23, s15
270
- ; GCN-NEXT: v_accvgpr_write_b32 a22, s14
271
- ; GCN-NEXT: v_accvgpr_write_b32 a21, s13
272
- ; GCN-NEXT: v_accvgpr_write_b32 a20, s12
273
- ; GCN-NEXT: v_accvgpr_write_b32 a19, s11
274
- ; GCN-NEXT: v_accvgpr_write_b32 a18, s10
275
- ; GCN-NEXT: v_accvgpr_write_b32 a17, s9
276
- ; GCN-NEXT: v_accvgpr_write_b32 a16, s8
277
- ; GCN-NEXT: v_mov_b32_e32 v10, s20
278
- ; GCN-NEXT: v_mov_b32_e32 v11, s21
279
- ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31]
280
- ; GCN-NEXT: v_mov_b32_e32 v12, s22
281
- ; GCN-NEXT: v_mov_b32_e32 v13, s23
282
- ; GCN-NEXT: v_mov_b32_e32 v0, s16
283
- ; GCN-NEXT: v_mov_b32_e32 v1, s17
284
- ; GCN-NEXT: v_mov_b32_e32 v2, s18
285
- ; GCN-NEXT: v_mov_b32_e32 v3, s19
286
- ; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
287
- ; GCN-NEXT: s_waitcnt vmcnt(0)
288
- ; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
257
+ ; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
258
+ ; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
259
+ ; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
260
+ ; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
261
+ ; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
262
+ ; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
263
+ ; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
264
+ ; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
265
+ ; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
266
+ ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
267
+ ; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
268
+ ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
269
+ ; GCN-NEXT: v_mov_b32_e32 v40, s20
270
+ ; GCN-NEXT: v_mov_b32_e32 v41, s21
271
+ ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
272
+ ; GCN-NEXT: v_mov_b32_e32 v42, s22
273
+ ; GCN-NEXT: v_mov_b32_e32 v43, s23
274
+ ; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
275
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
276
+ ; GCN-NEXT: s_nop 2
277
+ ; GCN-NEXT: v_mov_b32_e32 v16, s16
278
+ ; GCN-NEXT: v_mov_b32_e32 v17, s17
279
+ ; GCN-NEXT: v_mov_b32_e32 v18, s18
280
+ ; GCN-NEXT: v_mov_b32_e32 v19, s19
281
+ ; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
289
282
; GCN-NEXT: s_waitcnt vmcnt(0)
290
283
; GCN-NEXT: s_nop 0
291
- ; GCN-NEXT: v_mov_b32_e32 v0 , s12
292
- ; GCN-NEXT: v_mov_b32_e32 v1 , s13
293
- ; GCN-NEXT: v_mov_b32_e32 v2 , s14
294
- ; GCN-NEXT: v_mov_b32_e32 v3 , s15
295
- ; GCN-NEXT: global_store_dwordx4 v8 , v[0:3 ], s[0:1] offset:16 sc0 sc1
284
+ ; GCN-NEXT: v_mov_b32_e32 v16 , s12
285
+ ; GCN-NEXT: v_mov_b32_e32 v17 , s13
286
+ ; GCN-NEXT: v_mov_b32_e32 v18 , s14
287
+ ; GCN-NEXT: v_mov_b32_e32 v19 , s15
288
+ ; GCN-NEXT: global_store_dwordx4 v44 , v[16:19 ], s[0:1] offset:16 sc0 sc1
296
289
; GCN-NEXT: s_waitcnt vmcnt(0)
297
290
; GCN-NEXT: s_nop 0
298
- ; GCN-NEXT: v_mov_b32_e32 v0 , s8
299
- ; GCN-NEXT: v_mov_b32_e32 v1 , s9
300
- ; GCN-NEXT: v_mov_b32_e32 v2 , s10
301
- ; GCN-NEXT: v_mov_b32_e32 v3 , s11
302
- ; GCN-NEXT: global_store_dwordx4 v8 , v[0:3 ], s[0:1] sc0 sc1
291
+ ; GCN-NEXT: v_mov_b32_e32 v16 , s8
292
+ ; GCN-NEXT: v_mov_b32_e32 v17 , s9
293
+ ; GCN-NEXT: v_mov_b32_e32 v18 , s10
294
+ ; GCN-NEXT: v_mov_b32_e32 v19 , s11
295
+ ; GCN-NEXT: global_store_dwordx4 v44 , v[16:19 ], s[0:1] sc0 sc1
303
296
; GCN-NEXT: s_waitcnt vmcnt(0)
304
- ; GCN-NEXT: global_store_dwordx4 v8, a [8:11], s[0:1] offset:32 sc0 sc1
297
+ ; GCN-NEXT: global_store_dwordx4 v44, v [8:11], s[0:1] offset:32 sc0 sc1
305
298
; GCN-NEXT: s_waitcnt vmcnt(0)
306
- ; GCN-NEXT: global_store_dwordx4 v8, a [12:15], s[0:1] offset:48 sc0 sc1
299
+ ; GCN-NEXT: global_store_dwordx4 v44, v [12:15], s[0:1] offset:48 sc0 sc1
307
300
; GCN-NEXT: s_waitcnt vmcnt(0)
308
- ; GCN-NEXT: global_store_dwordx4 v8, a [0:3], s[0:1] sc0 sc1
301
+ ; GCN-NEXT: global_store_dwordx4 v44, v [0:3], s[0:1] sc0 sc1
309
302
; GCN-NEXT: s_waitcnt vmcnt(0)
310
- ; GCN-NEXT: global_store_dwordx4 v8, a [4:7], s[0:1] offset:16 sc0 sc1
303
+ ; GCN-NEXT: global_store_dwordx4 v44, v [4:7], s[0:1] offset:16 sc0 sc1
311
304
; GCN-NEXT: s_waitcnt vmcnt(0)
312
305
; GCN-NEXT: s_endpgm
313
306
%result = call <16 x float > @llvm.amdgcn.mfma.f32.32x32x16.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <16 x float > %arg2 , i32 0 , i32 0 , i32 0 )
@@ -322,62 +315,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
322
315
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
323
316
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
324
317
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
325
- ; GCN-NEXT: v_mov_b32_e32 v8 , 0
318
+ ; GCN-NEXT: v_mov_b32_e32 v44 , 0
326
319
; GCN-NEXT: s_waitcnt lgkmcnt(0)
327
- ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
328
- ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
329
- ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
330
- ; GCN-NEXT: v_accvgpr_write_b32 a31, s23
331
- ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
332
- ; GCN-NEXT: v_accvgpr_write_b32 a30, s22
333
- ; GCN-NEXT: v_accvgpr_write_b32 a29, s21
334
- ; GCN-NEXT: v_accvgpr_write_b32 a28, s20
335
- ; GCN-NEXT: v_accvgpr_write_b32 a27, s19
336
- ; GCN-NEXT: v_accvgpr_write_b32 a26, s18
337
- ; GCN-NEXT: v_accvgpr_write_b32 a25, s17
338
- ; GCN-NEXT: v_accvgpr_write_b32 a24, s16
339
- ; GCN-NEXT: v_accvgpr_write_b32 a23, s15
340
- ; GCN-NEXT: v_accvgpr_write_b32 a22, s14
341
- ; GCN-NEXT: v_accvgpr_write_b32 a21, s13
342
- ; GCN-NEXT: v_accvgpr_write_b32 a20, s12
343
- ; GCN-NEXT: v_accvgpr_write_b32 a19, s11
344
- ; GCN-NEXT: v_accvgpr_write_b32 a18, s10
345
- ; GCN-NEXT: v_accvgpr_write_b32 a17, s9
346
- ; GCN-NEXT: v_accvgpr_write_b32 a16, s8
347
- ; GCN-NEXT: v_mov_b32_e32 v10, s20
348
- ; GCN-NEXT: v_mov_b32_e32 v11, s21
349
- ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
350
- ; GCN-NEXT: v_mov_b32_e32 v12, s22
351
- ; GCN-NEXT: v_mov_b32_e32 v13, s23
352
- ; GCN-NEXT: v_mov_b32_e32 v0, s16
353
- ; GCN-NEXT: v_mov_b32_e32 v1, s17
354
- ; GCN-NEXT: v_mov_b32_e32 v2, s18
355
- ; GCN-NEXT: v_mov_b32_e32 v3, s19
356
- ; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
357
- ; GCN-NEXT: s_waitcnt vmcnt(0)
358
- ; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
320
+ ; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
321
+ ; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
322
+ ; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
323
+ ; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
324
+ ; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
325
+ ; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
326
+ ; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
327
+ ; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
328
+ ; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
329
+ ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
330
+ ; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
331
+ ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
332
+ ; GCN-NEXT: v_mov_b32_e32 v40, s20
333
+ ; GCN-NEXT: v_mov_b32_e32 v41, s21
334
+ ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
335
+ ; GCN-NEXT: v_mov_b32_e32 v42, s22
336
+ ; GCN-NEXT: v_mov_b32_e32 v43, s23
337
+ ; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
338
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
339
+ ; GCN-NEXT: s_nop 2
340
+ ; GCN-NEXT: v_mov_b32_e32 v16, s16
341
+ ; GCN-NEXT: v_mov_b32_e32 v17, s17
342
+ ; GCN-NEXT: v_mov_b32_e32 v18, s18
343
+ ; GCN-NEXT: v_mov_b32_e32 v19, s19
344
+ ; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
359
345
; GCN-NEXT: s_waitcnt vmcnt(0)
360
346
; GCN-NEXT: s_nop 0
361
- ; GCN-NEXT: v_mov_b32_e32 v0 , s12
362
- ; GCN-NEXT: v_mov_b32_e32 v1 , s13
363
- ; GCN-NEXT: v_mov_b32_e32 v2 , s14
364
- ; GCN-NEXT: v_mov_b32_e32 v3 , s15
365
- ; GCN-NEXT: global_store_dwordx4 v8 , v[0:3 ], s[0:1] offset:16 sc0 sc1
347
+ ; GCN-NEXT: v_mov_b32_e32 v16 , s12
348
+ ; GCN-NEXT: v_mov_b32_e32 v17 , s13
349
+ ; GCN-NEXT: v_mov_b32_e32 v18 , s14
350
+ ; GCN-NEXT: v_mov_b32_e32 v19 , s15
351
+ ; GCN-NEXT: global_store_dwordx4 v44 , v[16:19 ], s[0:1] offset:16 sc0 sc1
366
352
; GCN-NEXT: s_waitcnt vmcnt(0)
367
353
; GCN-NEXT: s_nop 0
368
- ; GCN-NEXT: v_mov_b32_e32 v0 , s8
369
- ; GCN-NEXT: v_mov_b32_e32 v1 , s9
370
- ; GCN-NEXT: v_mov_b32_e32 v2 , s10
371
- ; GCN-NEXT: v_mov_b32_e32 v3 , s11
372
- ; GCN-NEXT: global_store_dwordx4 v8 , v[0:3 ], s[0:1] sc0 sc1
354
+ ; GCN-NEXT: v_mov_b32_e32 v16 , s8
355
+ ; GCN-NEXT: v_mov_b32_e32 v17 , s9
356
+ ; GCN-NEXT: v_mov_b32_e32 v18 , s10
357
+ ; GCN-NEXT: v_mov_b32_e32 v19 , s11
358
+ ; GCN-NEXT: global_store_dwordx4 v44 , v[16:19 ], s[0:1] sc0 sc1
373
359
; GCN-NEXT: s_waitcnt vmcnt(0)
374
- ; GCN-NEXT: global_store_dwordx4 v8, a [8:11], s[0:1] offset:32 sc0 sc1
360
+ ; GCN-NEXT: global_store_dwordx4 v44, v [8:11], s[0:1] offset:32 sc0 sc1
375
361
; GCN-NEXT: s_waitcnt vmcnt(0)
376
- ; GCN-NEXT: global_store_dwordx4 v8, a [12:15], s[0:1] offset:48 sc0 sc1
362
+ ; GCN-NEXT: global_store_dwordx4 v44, v [12:15], s[0:1] offset:48 sc0 sc1
377
363
; GCN-NEXT: s_waitcnt vmcnt(0)
378
- ; GCN-NEXT: global_store_dwordx4 v8, a [0:3], s[0:1] sc0 sc1
364
+ ; GCN-NEXT: global_store_dwordx4 v44, v [0:3], s[0:1] sc0 sc1
379
365
; GCN-NEXT: s_waitcnt vmcnt(0)
380
- ; GCN-NEXT: global_store_dwordx4 v8, a [4:7], s[0:1] offset:16 sc0 sc1
366
+ ; GCN-NEXT: global_store_dwordx4 v44, v [4:7], s[0:1] offset:16 sc0 sc1
381
367
; GCN-NEXT: s_waitcnt vmcnt(0)
382
368
; GCN-NEXT: s_endpgm
383
369
%result = call <16 x float > @llvm.amdgcn.mfma.f32.32x32x16.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <16 x float > %arg2 , i32 1 , i32 2 , i32 3 )
@@ -393,35 +379,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
393
379
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
394
380
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
395
381
; GCN-NEXT: s_waitcnt lgkmcnt(0)
396
- ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
397
- ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
398
- ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
399
- ; GCN-NEXT: v_accvgpr_write_b32 a0, s8
400
- ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
401
- ; GCN-NEXT: v_accvgpr_write_b32 a1, s9
402
- ; GCN-NEXT: v_accvgpr_write_b32 a2, s10
403
- ; GCN-NEXT: v_accvgpr_write_b32 a3, s11
404
- ; GCN-NEXT: v_accvgpr_write_b32 a4, s12
405
- ; GCN-NEXT: v_accvgpr_write_b32 a5, s13
406
- ; GCN-NEXT: v_accvgpr_write_b32 a6, s14
407
- ; GCN-NEXT: v_accvgpr_write_b32 a7, s15
408
- ; GCN-NEXT: v_accvgpr_write_b32 a8, s16
409
- ; GCN-NEXT: v_accvgpr_write_b32 a9, s17
410
- ; GCN-NEXT: v_accvgpr_write_b32 a10, s18
411
- ; GCN-NEXT: v_accvgpr_write_b32 a11, s19
412
- ; GCN-NEXT: v_accvgpr_write_b32 a12, s20
413
- ; GCN-NEXT: v_accvgpr_write_b32 a13, s21
414
- ; GCN-NEXT: v_accvgpr_write_b32 a14, s22
415
- ; GCN-NEXT: v_accvgpr_write_b32 a15, s23
382
+ ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
383
+ ; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
384
+ ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
385
+ ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
386
+ ; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
387
+ ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
388
+ ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
389
+ ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
390
+ ; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
391
+ ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
392
+ ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
393
+ ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
416
394
; GCN-NEXT: s_nop 1
417
- ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a [0:15], v[0:3 ], v[4:7 ], a [0:15]
418
- ; GCN-NEXT: v_mov_b32_e32 v0 , 0
395
+ ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v [0:15], v[16:19 ], v[20:23 ], v [0:15]
396
+ ; GCN-NEXT: v_mov_b32_e32 v16 , 0
419
397
; GCN-NEXT: s_nop 7
420
398
; GCN-NEXT: s_nop 2
421
- ; GCN-NEXT: global_store_dwordx4 v0, a [12:15], s[0:1] offset:48
422
- ; GCN-NEXT: global_store_dwordx4 v0, a [8:11], s[0:1] offset:32
423
- ; GCN-NEXT: global_store_dwordx4 v0, a [4:7], s[0:1] offset:16
424
- ; GCN-NEXT: global_store_dwordx4 v0, a [0:3], s[0:1]
399
+ ; GCN-NEXT: global_store_dwordx4 v16, v [12:15], s[0:1] offset:48
400
+ ; GCN-NEXT: global_store_dwordx4 v16, v [8:11], s[0:1] offset:32
401
+ ; GCN-NEXT: global_store_dwordx4 v16, v [4:7], s[0:1] offset:16
402
+ ; GCN-NEXT: global_store_dwordx4 v16, v [0:3], s[0:1]
425
403
; GCN-NEXT: s_endpgm
426
404
%result = call <16 x float > @llvm.amdgcn.mfma.f32.32x32x16.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <16 x float > %arg2 , i32 0 , i32 0 , i32 0 )
427
405
store <16 x float > %result , ptr addrspace (1 ) %out
@@ -435,40 +413,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
435
413
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
436
414
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
437
415
; GCN-NEXT: s_waitcnt lgkmcnt(0)
438
- ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
439
- ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
440
- ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
441
- ; GCN-NEXT: v_accvgpr_write_b32 a0, s8
442
- ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
443
- ; GCN-NEXT: v_accvgpr_write_b32 a1, s9
444
- ; GCN-NEXT: v_accvgpr_write_b32 a2, s10
445
- ; GCN-NEXT: v_accvgpr_write_b32 a3, s11
446
- ; GCN-NEXT: v_accvgpr_write_b32 a4, s12
447
- ; GCN-NEXT: v_accvgpr_write_b32 a5, s13
448
- ; GCN-NEXT: v_accvgpr_write_b32 a6, s14
449
- ; GCN-NEXT: v_accvgpr_write_b32 a7, s15
450
- ; GCN-NEXT: v_accvgpr_write_b32 a8, s16
451
- ; GCN-NEXT: v_accvgpr_write_b32 a9, s17
452
- ; GCN-NEXT: v_accvgpr_write_b32 a10, s18
453
- ; GCN-NEXT: v_accvgpr_write_b32 a11, s19
454
- ; GCN-NEXT: v_accvgpr_write_b32 a12, s20
455
- ; GCN-NEXT: v_accvgpr_write_b32 a13, s21
456
- ; GCN-NEXT: v_accvgpr_write_b32 a14, s22
457
- ; GCN-NEXT: v_accvgpr_write_b32 a15, s23
416
+ ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
417
+ ; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
418
+ ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
419
+ ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
420
+ ; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
421
+ ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
422
+ ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
423
+ ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
424
+ ; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
425
+ ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
426
+ ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
427
+ ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
458
428
; GCN-NEXT: s_nop 1
459
- ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a [0:15], v[0:3 ], v[4:7 ], a [0:15] cbsz:3 abid:2 blgp:1
460
- ; GCN-NEXT: v_mov_b32_e32 v0 , 0
429
+ ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v [0:15], v[16:19 ], v[20:23 ], v [0:15] cbsz:3 abid:2 blgp:1
430
+ ; GCN-NEXT: v_mov_b32_e32 v16 , 0
461
431
; GCN-NEXT: s_nop 7
462
432
; GCN-NEXT: s_nop 2
463
- ; GCN-NEXT: global_store_dwordx4 v0, a [12:15], s[0:1] offset:48
464
- ; GCN-NEXT: global_store_dwordx4 v0, a [8:11], s[0:1] offset:32
465
- ; GCN-NEXT: global_store_dwordx4 v0, a [4:7], s[0:1] offset:16
466
- ; GCN-NEXT: global_store_dwordx4 v0, a [0:3], s[0:1]
433
+ ; GCN-NEXT: global_store_dwordx4 v16, v [12:15], s[0:1] offset:48
434
+ ; GCN-NEXT: global_store_dwordx4 v16, v [8:11], s[0:1] offset:32
435
+ ; GCN-NEXT: global_store_dwordx4 v16, v [4:7], s[0:1] offset:16
436
+ ; GCN-NEXT: global_store_dwordx4 v16, v [0:3], s[0:1]
467
437
; GCN-NEXT: s_endpgm
468
438
%result = call <16 x float > @llvm.amdgcn.mfma.f32.32x32x16.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <16 x float > %arg2 , i32 3 , i32 2 , i32 1 )
469
439
store <16 x float > %result , ptr addrspace (1 ) %out
470
440
ret void
471
441
}
472
442
473
; Attribute groups for this test. The updated form of #0 keeps the "512,512"
; flat work-group size and adds "amdgpu-agpr-alloc"="0,0". In the regenerated
; GCN checks visible alongside this change, the MFMA result/accumulator operands
; are VGPR ranges (v[0:15], v[16:31]) where the previous checks used AGPR ranges
; (a[0:15], a[16:31]) filled by v_accvgpr_write_b32.
; NOTE(review): presumably the *__vgprcd* kernels are the ones tagged with #0;
; their define lines are not fully visible here, so confirm before relying on it.
- attributes #0 = { "amdgpu-flat-work-group-size" ="512,512" }
443
+ attributes #0 = { "amdgpu-flat-work-group-size" ="512,512" "amdgpu-agpr-alloc" = "0,0" }
474
444
attributes #1 = { "amdgpu-flat-work-group-size" ="1,64" }
0 commit comments