@@ -1315,5 +1315,203 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
13151315 ret void
13161316}
13171317
1318+ ; --------------------------------------------------------------------
1319+ ; llvm.amdgcn.mfma.f32.16x16x32.bf16
1320+ ; --------------------------------------------------------------------
1321+
; Intrinsic under test. Takes two <8 x bfloat> operands and a <4 x float>
; operand and returns <4 x float>. The three trailing i32 operands are marked
; immarg; in the check lines of the tests that pass non-zero values they are
; printed, in order, as the cbsz, abid and blgp modifiers of
; v_mfma_f32_16x16x32_bf16.
1322+ declare <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.bf16 (<8 x bfloat>, <8 x bfloat>, <4 x float >, i32 immarg, i32 immarg, i32 immarg)
1323+
; Calls the bf16 MFMA intrinsic with all three immediate operands set to 0 and
; returns the <4 x float> result directly.
;
; Expected code for both check prefixes: v8-v11 are written to a0-a3, the MFMA
; reads v[0:3] and v[4:7] and accumulates in place in a[0:3] with no modifiers
; printed, and a0-a3 are read back into v0-v3 before the return.
; The GISEL checks additionally expect a v_lshrrev_b32 / v_mov_b32_sdwa pair on
; each of v0-v7 ahead of the MFMA; the SDAG checks have no such instructions.
;
; NOTE(review): the check lines look machine-generated (presumably by
; update_llc_test_checks.py) - confirm against the file header before editing
; them by hand.
1324+ define <4 x float > @test_mfma_f32_16x16x32_bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 ) {
1325+ ; SDAG-LABEL: test_mfma_f32_16x16x32_bf16:
1326+ ; SDAG: ; %bb.0:
1327+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1328+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
1329+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
1330+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
1331+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
1332+ ; SDAG-NEXT: s_nop 1
1333+ ; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
1334+ ; SDAG-NEXT: s_nop 6
1335+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1336+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1337+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1338+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1339+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
1340+ ;
1341+ ; GISEL-LABEL: test_mfma_f32_16x16x32_bf16:
1342+ ; GISEL: ; %bb.0:
1343+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1344+ ; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
1345+ ; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
1346+ ; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
1347+ ; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
1348+ ; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1349+ ; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1350+ ; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1351+ ; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1352+ ; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
1353+ ; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
1354+ ; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
1355+ ; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
1356+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
1357+ ; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1358+ ; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1359+ ; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1360+ ; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1361+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
1362+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
1363+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
1364+ ; GISEL-NEXT: s_nop 1
1365+ ; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
1366+ ; GISEL-NEXT: s_nop 6
1367+ ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
1368+ ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
1369+ ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
1370+ ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
1371+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
1372+ %result = call <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 0 )
1373+ ret <4 x float > %result
1374+ }
1375+
; Calls the bf16 MFMA intrinsic with the three immediate operands set to
; 1, 1, 1 and returns the <4 x float> result directly.
;
; Both check prefixes expect the MFMA to be printed with
; "cbsz:1 abid:1 blgp:1", i.e. each immediate shows up as the matching
; modifier. The surrounding sequence is: v8-v11 written to a0-a3, MFMA
; accumulating in place in a[0:3], then a0-a3 read back into v0-v3.
; The GISEL checks additionally expect a v_lshrrev_b32 / v_mov_b32_sdwa pair on
; each of v0-v7 ahead of the MFMA; the SDAG checks have no such instructions.
1376+ define <4 x float > @test_mfma_f32_16x16x32_bf16__flags (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 ) {
1377+ ; SDAG-LABEL: test_mfma_f32_16x16x32_bf16__flags:
1378+ ; SDAG: ; %bb.0:
1379+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1380+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
1381+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
1382+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
1383+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
1384+ ; SDAG-NEXT: s_nop 1
1385+ ; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
1386+ ; SDAG-NEXT: s_nop 6
1387+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1388+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1389+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1390+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1391+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
1392+ ;
1393+ ; GISEL-LABEL: test_mfma_f32_16x16x32_bf16__flags:
1394+ ; GISEL: ; %bb.0:
1395+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1396+ ; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
1397+ ; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
1398+ ; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
1399+ ; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
1400+ ; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1401+ ; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1402+ ; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1403+ ; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1404+ ; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
1405+ ; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
1406+ ; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
1407+ ; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
1408+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
1409+ ; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1410+ ; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1411+ ; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1412+ ; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1413+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
1414+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
1415+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
1416+ ; GISEL-NEXT: s_nop 1
1417+ ; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
1418+ ; GISEL-NEXT: s_nop 6
1419+ ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
1420+ ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
1421+ ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
1422+ ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
1423+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
1424+ %result = call <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 , i32 1 , i32 1 , i32 1 )
1425+ ret <4 x float > %result
1426+ }
1427+
; amdgpu_kernel variant (attribute group #0): calls the bf16 MFMA intrinsic
; with all three immediate operands set to 0 and stores the <4 x float> result
; to %out.
;
; Both check prefixes expect the MFMA to use VGPRs for every operand,
; including the destination v[0:3] and the accumulator input v[8:11]; no
; v_accvgpr_* instructions appear in the expected output. The inputs are
; loaded with s_load_dwordx8 / s_load_dwordx4 and moved into VGPRs with
; v_mov_b64, and the result is written with global_store_dwordx4.
; The two prefixes differ only in scheduling: SDAG loads the store address and
; materialises the zero offset before the first s_waitcnt and uses "s_nop 6"
; after the MFMA; GISEL issues the address load later, materialises the zero
; offset after the MFMA, and has a second s_waitcnt followed by "s_nop 4".
1428+ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd (ptr addrspace (1 ) %out , <8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 ) #0 {
1429+ ; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
1430+ ; SDAG: ; %bb.0:
1431+ ; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
1432+ ; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
1433+ ; SDAG-NEXT: v_mov_b32_e32 v12, 0
1434+ ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1435+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1436+ ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
1437+ ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
1438+ ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
1439+ ; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
1440+ ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
1441+ ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
1442+ ; SDAG-NEXT: s_nop 1
1443+ ; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
1444+ ; SDAG-NEXT: s_nop 6
1445+ ; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
1446+ ; SDAG-NEXT: s_endpgm
1447+ ;
1448+ ; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
1449+ ; GISEL: ; %bb.0:
1450+ ; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
1451+ ; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
1452+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1453+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
1454+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
1455+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
1456+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
1457+ ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1458+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
1459+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
1460+ ; GISEL-NEXT: s_nop 1
1461+ ; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
1462+ ; GISEL-NEXT: v_mov_b32_e32 v4, 0
1463+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1464+ ; GISEL-NEXT: s_nop 4
1465+ ; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
1466+ ; GISEL-NEXT: s_endpgm
1467+ %result = call <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 0 )
1468+ store <4 x float > %result , ptr addrspace (1 ) %out
1469+ ret void
1470+ }
1471+
; amdgpu_kernel variant (attribute group #0): calls the bf16 MFMA intrinsic
; with the three immediate operands set to 3, 2, 1 and stores the
; <4 x float> result to %out.
;
; Both check prefixes expect the MFMA to be printed with
; "cbsz:3 abid:2 blgp:1" and to use VGPRs for every operand, including the
; destination v[0:3] and the accumulator input v[8:11]; no v_accvgpr_*
; instructions appear in the expected output. Using three distinct immediate
; values lets the checks show which operand maps to which modifier.
; The two prefixes differ only in scheduling: SDAG loads the store address and
; materialises the zero offset before the first s_waitcnt and uses "s_nop 6"
; after the MFMA; GISEL issues the address load later, materialises the zero
; offset after the MFMA, and has a second s_waitcnt followed by "s_nop 4".
1472+ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags (ptr addrspace (1 ) %out , <8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 ) #0 {
1473+ ; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
1474+ ; SDAG: ; %bb.0:
1475+ ; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
1476+ ; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
1477+ ; SDAG-NEXT: v_mov_b32_e32 v12, 0
1478+ ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1479+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1480+ ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
1481+ ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
1482+ ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
1483+ ; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
1484+ ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
1485+ ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
1486+ ; SDAG-NEXT: s_nop 1
1487+ ; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
1488+ ; SDAG-NEXT: s_nop 6
1489+ ; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
1490+ ; SDAG-NEXT: s_endpgm
1491+ ;
1492+ ; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
1493+ ; GISEL: ; %bb.0:
1494+ ; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
1495+ ; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
1496+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1497+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
1498+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
1499+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
1500+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
1501+ ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1502+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
1503+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
1504+ ; GISEL-NEXT: s_nop 1
1505+ ; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
1506+ ; GISEL-NEXT: v_mov_b32_e32 v4, 0
1507+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1508+ ; GISEL-NEXT: s_nop 4
1509+ ; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
1510+ ; GISEL-NEXT: s_endpgm
1511+ %result = call <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 , i32 3 , i32 2 , i32 1 )
1512+ store <4 x float > %result , ptr addrspace (1 ) %out
1513+ ret void
1514+ }
1515+
; Attribute groups. #0 sets "amdgpu-flat-work-group-size" to "512,512" and is
; attached to the two bf16 *_no_agpr__vgprcd kernels defined just before this
; point. #1 sets the same attribute to "1,64"; no use of #1 is visible in this
; part of the file - presumably it is referenced by earlier tests (verify
; before changing it).
13181516attributes #0 = { "amdgpu-flat-work-group-size" ="512,512" }
13191517attributes #1 = { "amdgpu-flat-work-group-size" ="1,64" }
0 commit comments