@@ -1430,4 +1430,217 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
14301430 ret <16 x float > %result
14311431}
14321432
; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.i32.16x16x128.i8
; --------------------------------------------------------------------

; As exercised by the tests in this section: operands 0 and 1 are the
; <4 x i32> and <8 x i32> sources, operand 2 is a <4 x i32> value whose
; registers are also the destination in the expected output, operand 3 is an
; i32 passed in a single register, and the last two i32 operands are
; constants that the expected output prints as the cbsz and abid modifiers.
declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32)
1438+
; Kernel variant. %a, %b and %idx are kernel arguments; the third intrinsic
; operand is loaded from %arg at the element selected by the work-item id, and
; the intrinsic result is stored to %arg itself (not to the indexed element).
; The constant operands 1 and 2 appear in the expected output as cbsz:1 abid:2.
define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
; SDAG:       ; %bb.0: ; %bb
; SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; SDAG-NEXT:    v_mov_b32_e32 v16, 0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    global_load_dwordx4 v[8:11], v0, s[2:3]
; SDAG-NEXT:    s_load_dword s16, s[0:1], 0x64
; SDAG-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
; SDAG-NEXT:    v_mov_b32_e32 v12, s4
; SDAG-NEXT:    v_mov_b32_e32 v13, s5
; SDAG-NEXT:    v_mov_b32_e32 v14, s6
; SDAG-NEXT:    v_mov_b32_e32 v15, s7
; SDAG-NEXT:    v_mov_b32_e32 v0, s8
; SDAG-NEXT:    v_mov_b32_e32 v1, s9
; SDAG-NEXT:    v_mov_b32_e32 v2, s10
; SDAG-NEXT:    v_mov_b32_e32 v3, s11
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v4, s12
; SDAG-NEXT:    v_mov_b32_e32 v5, s13
; SDAG-NEXT:    v_mov_b32_e32 v6, s14
; SDAG-NEXT:    v_mov_b32_e32 v7, s15
; SDAG-NEXT:    v_mov_b32_e32 v17, s16
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; SDAG-NEXT:    s_nop 6
; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3]
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_dwordx4 v[8:11], v0, s[2:3]
; GISEL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GISEL-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
; GISEL-NEXT:    s_load_dword s16, s[0:1], 0x64
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT:    v_mov_b32_e32 v16, s16
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_nop 0
; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GISEL-NEXT:    s_nop 5
; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[2:3]
; GISEL-NEXT:    s_endpgm
bb:
  ; The block label is printed in the expected output ("; %bb"), so it keeps
  ; its name.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %acc.ptr = getelementptr <4 x i32>, ptr addrspace(1) %arg, i32 %tid
  %acc.in = load <4 x i32>, ptr addrspace(1) %acc.ptr
  %acc.out = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %a, <8 x i32> %b, <4 x i32> %acc.in, i32 %idx, i32 1, i32 2)
  store <4 x i32> %acc.out, ptr addrspace(1) %arg
  ret void
}
1503+
; Baseline callable-function variant: every operand is a plain function
; argument and both trailing constants are 0, so the expected instruction
; carries no cbsz/abid modifiers.
define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16
; SDAG-NEXT:    s_nop 6
; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
; GISEL-NEXT:    s_nop 6
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %mai = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x i32> %mai
}
1534+
; Same shape as the baseline test, but with the trailing constants set to
; (1, 3); the expected instruction prints them as cbsz:1 abid:3.
define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
; SDAG-NEXT:    s_nop 6
; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
; GISEL-NEXT:    s_nop 6
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %mai = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <4 x i32> %mai
}
1565+
; Counterpart of the __flags0 test with the trailing constants swapped to
; (3, 1); the expected instruction prints them as cbsz:3 abid:1.
define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
; SDAG-NEXT:    s_nop 6
; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
; GISEL-NEXT:    s_nop 6
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %mai = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <4 x i32> %mai
}
1596+
; All four arguments are marked inreg. The expected output shows them arriving
; in s0-s16 and being copied into v/a registers before the instruction is
; issued. Both trailing constants are 0, so no cbsz/abid modifiers are printed.
define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x i32> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v8, s0
; SDAG-NEXT:    v_mov_b32_e32 v9, s1
; SDAG-NEXT:    v_mov_b32_e32 v10, s2
; SDAG-NEXT:    v_mov_b32_e32 v11, s3
; SDAG-NEXT:    v_mov_b32_e32 v0, s4
; SDAG-NEXT:    v_mov_b32_e32 v1, s5
; SDAG-NEXT:    v_mov_b32_e32 v2, s6
; SDAG-NEXT:    v_mov_b32_e32 v3, s7
; SDAG-NEXT:    v_mov_b32_e32 v4, s8
; SDAG-NEXT:    v_mov_b32_e32 v5, s9
; SDAG-NEXT:    v_mov_b32_e32 v6, s10
; SDAG-NEXT:    v_mov_b32_e32 v7, s11
; SDAG-NEXT:    v_accvgpr_write_b32 a0, s12
; SDAG-NEXT:    v_accvgpr_write_b32 a1, s13
; SDAG-NEXT:    v_accvgpr_write_b32 a2, s14
; SDAG-NEXT:    v_accvgpr_write_b32 a3, s15
; SDAG-NEXT:    v_mov_b32_e32 v12, s16
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12
; SDAG-NEXT:    s_nop 6
; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT:    v_mov_b32_e32 v16, s16
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %mai = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x i32> %mai
}
1645+
; Attribute group #0: attached to the amdgpu_kernel test definitions that
; carry "#0" after their parameter list.
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
0 commit comments