@@ -1513,4 +1513,243 @@ bb:
15131513 ret void
15141514}
15151515
; Loads an i32 from %sgpr_base (an inreg addrspace(5) pointer) plus a constant
; byte offset of 65512 (0xffe8) and stores the loaded value through %out.
; In the expected output below, the GFX9, GFX10, GFX940 and GFX11 variants add
; 0xffe8 to the base with s_add_u32 before the scratch load, whereas the GFX12
; variant carries the constant as an immediate "offset:65512" on the load.
1516+ define amdgpu_gs void @sgpr_base_large_offset (ptr addrspace (1 ) %out , ptr addrspace (5 ) inreg %sgpr_base ) {
1517+ ; GFX9-LABEL: sgpr_base_large_offset:
1518+ ; GFX9: ; %bb.0: ; %entry
1519+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1520+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1521+ ; GFX9-NEXT: s_add_u32 s0, s2, 0xffe8
1522+ ; GFX9-NEXT: scratch_load_dword v2, off, s0
1523+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1524+ ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1525+ ; GFX9-NEXT: s_endpgm
1526+ ;
1527+ ; GFX10-LABEL: sgpr_base_large_offset:
1528+ ; GFX10: ; %bb.0: ; %entry
1529+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1530+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1531+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1532+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1533+ ; GFX10-NEXT: s_add_u32 s0, s2, 0xffe8
1534+ ; GFX10-NEXT: scratch_load_dword v2, off, s0
1535+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
1536+ ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1537+ ; GFX10-NEXT: s_endpgm
1538+ ;
1539+ ; GFX940-LABEL: sgpr_base_large_offset:
1540+ ; GFX940: ; %bb.0: ; %entry
1541+ ; GFX940-NEXT: s_add_u32 s0, s0, 0xffe8
1542+ ; GFX940-NEXT: scratch_load_dword v2, off, s0
1543+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1544+ ; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1545+ ; GFX940-NEXT: s_endpgm
1546+ ;
1547+ ; GFX11-LABEL: sgpr_base_large_offset:
1548+ ; GFX11: ; %bb.0: ; %entry
1549+ ; GFX11-NEXT: s_add_u32 s0, s0, 0xffe8
1550+ ; GFX11-NEXT: scratch_load_b32 v2, off, s0
1551+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1552+ ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1553+ ; GFX11-NEXT: s_nop 0
1554+ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1555+ ; GFX11-NEXT: s_endpgm
1556+ ;
1557+ ; GFX12-LABEL: sgpr_base_large_offset:
1558+ ; GFX12: ; %bb.0: ; %entry
1559+ ; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512
1560+ ; GFX12-NEXT: s_wait_loadcnt 0x0
1561+ ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1562+ ; GFX12-NEXT: s_nop 0
1563+ ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1564+ ; GFX12-NEXT: s_endpgm
1565+ entry:
1566+ %large_offset = getelementptr i8 , ptr addrspace (5 ) %sgpr_base , i32 65512
1567+ %load = load i32 , ptr addrspace (5 ) %large_offset , align 4
1568+ store i32 %load , ptr addrspace (1 ) %out
1569+ ret void
1570+ }
1571+
; Masks %sgpr_base with 4294967292 (0xfffffffc, printed as -4 in the expected
; s_and_b32 lines), then performs a volatile i32 load at byte offset 16842728
; (0x100ffe8) from the masked pointer and stores the result through %out.
; Every variant in the expected output below, including GFX12, adds 0x100ffe8
; to the masked base with a scalar add before the scratch load; none of them
; uses an immediate offset field for this constant.
1572+ define amdgpu_gs void @sgpr_base_large_offset_split (ptr addrspace (1 ) %out , ptr addrspace (5 ) inreg %sgpr_base ) {
1573+ ; GFX9-LABEL: sgpr_base_large_offset_split:
1574+ ; GFX9: ; %bb.0: ; %entry
1575+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1576+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1577+ ; GFX9-NEXT: s_and_b32 s0, s2, -4
1578+ ; GFX9-NEXT: s_add_u32 s0, s0, 0x100ffe8
1579+ ; GFX9-NEXT: scratch_load_dword v2, off, s0 glc
1580+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1581+ ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1582+ ; GFX9-NEXT: s_endpgm
1583+ ;
1584+ ; GFX10-LABEL: sgpr_base_large_offset_split:
1585+ ; GFX10: ; %bb.0: ; %entry
1586+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1587+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1588+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1589+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1590+ ; GFX10-NEXT: s_and_b32 s0, s2, -4
1591+ ; GFX10-NEXT: s_add_u32 s0, s0, 0x100ffe8
1592+ ; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
1593+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
1594+ ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1595+ ; GFX10-NEXT: s_endpgm
1596+ ;
1597+ ; GFX940-LABEL: sgpr_base_large_offset_split:
1598+ ; GFX940: ; %bb.0: ; %entry
1599+ ; GFX940-NEXT: s_and_b32 s0, s0, -4
1600+ ; GFX940-NEXT: s_add_u32 s0, s0, 0x100ffe8
1601+ ; GFX940-NEXT: scratch_load_dword v2, off, s0 sc0 sc1
1602+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1603+ ; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1604+ ; GFX940-NEXT: s_endpgm
1605+ ;
1606+ ; GFX11-LABEL: sgpr_base_large_offset_split:
1607+ ; GFX11: ; %bb.0: ; %entry
1608+ ; GFX11-NEXT: s_and_b32 s0, s0, -4
1609+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1610+ ; GFX11-NEXT: s_add_u32 s0, s0, 0x100ffe8
1611+ ; GFX11-NEXT: scratch_load_b32 v2, off, s0 glc dlc
1612+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1613+ ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1614+ ; GFX11-NEXT: s_nop 0
1615+ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1616+ ; GFX11-NEXT: s_endpgm
1617+ ;
1618+ ; GFX12-LABEL: sgpr_base_large_offset_split:
1619+ ; GFX12: ; %bb.0: ; %entry
1620+ ; GFX12-NEXT: s_and_b32 s0, s0, -4
1621+ ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1622+ ; GFX12-NEXT: s_add_co_u32 s0, s0, 0x100ffe8
1623+ ; GFX12-NEXT: scratch_load_b32 v2, off, s0 scope:SCOPE_SYS
1624+ ; GFX12-NEXT: s_wait_loadcnt 0x0
1625+ ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1626+ ; GFX12-NEXT: s_nop 0
1627+ ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1628+ ; GFX12-NEXT: s_endpgm
1629+ entry:
; NOTE(review): the next line is a commented-out alloca ("allignedBase" is
; presumably a misspelling of "alignedBase"); it takes no part in the test.
1630+ ;%allignedBase = alloca [33554432 x i8], align 4, addrspace(5)
1631+ %sgpr_base_i32 = ptrtoint ptr addrspace (5 ) %sgpr_base to i32
; NOTE(review): 4294967292 is the unsigned spelling of the i32 value -4; the
; expected s_and_b32 lines above print it as -4.
1632+ %sgpr_base_i32_align4 = and i32 %sgpr_base_i32 , 4294967292
1633+ %sgpr_base_align4 = inttoptr i32 %sgpr_base_i32_align4 to ptr addrspace (5 )
1634+ %split_offset = getelementptr inbounds [33554432 x i8 ], ptr addrspace (5 ) %sgpr_base_align4 , i32 0 , i32 16842728
1635+ %load = load volatile i32 , ptr addrspace (5 ) %split_offset , align 4
1636+ store i32 %load , ptr addrspace (1 ) %out
1637+ ret void
1638+ }
1639+
; Stores the constant 15 (volatile i32) at %sgpr_base indexed by
; %sidx + %vidx + 65512, where %sgpr_base and %sidx are inreg arguments and
; %vidx is not.
; In the expected output below, the GFX9 and GFX940 variants move 0xffe8 into
; a VGPR and combine everything with v_add3_u32; the GFX10 and GFX11 variants
; pass 0xffe8 directly as a v_add3_u32 operand; the GFX12 variant adds only
; the two register terms and places the constant in "offset:65512" on the
; scratch store.
1640+ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset (ptr addrspace (5 ) inreg %sgpr_base , i32 inreg %sidx , i32 %vidx ) {
1641+ ; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1642+ ; GFX9: ; %bb.0: ; %bb
1643+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1644+ ; GFX9-NEXT: v_add_u32_e32 v0, s3, v0
1645+ ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffe8
1646+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1647+ ; GFX9-NEXT: v_add3_u32 v0, s2, v0, v1
1648+ ; GFX9-NEXT: v_mov_b32_e32 v1, 15
1649+ ; GFX9-NEXT: scratch_store_dword v0, v1, off
1650+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1651+ ; GFX9-NEXT: s_endpgm
1652+ ;
1653+ ; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1654+ ; GFX10: ; %bb.0: ; %bb
1655+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1656+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1657+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1658+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1659+ ; GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0
1660+ ; GFX10-NEXT: v_mov_b32_e32 v1, 15
1661+ ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 0xffe8
1662+ ; GFX10-NEXT: scratch_store_dword v0, v1, off
1663+ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1664+ ; GFX10-NEXT: s_endpgm
1665+ ;
1666+ ; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1667+ ; GFX940: ; %bb.0: ; %bb
1668+ ; GFX940-NEXT: v_add_u32_e32 v0, s1, v0
1669+ ; GFX940-NEXT: v_mov_b32_e32 v1, 0xffe8
1670+ ; GFX940-NEXT: v_add3_u32 v0, s0, v0, v1
1671+ ; GFX940-NEXT: v_mov_b32_e32 v1, 15
1672+ ; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
1673+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1674+ ; GFX940-NEXT: s_endpgm
1675+ ;
1676+ ; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1677+ ; GFX11: ; %bb.0: ; %bb
1678+ ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1679+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1680+ ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 0xffe8
1681+ ; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc
1682+ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1683+ ; GFX11-NEXT: s_endpgm
1684+ ;
1685+ ; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1686+ ; GFX12: ; %bb.0: ; %bb
1687+ ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1688+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1689+ ; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
1690+ ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
1691+ ; GFX12-NEXT: s_wait_storecnt 0x0
1692+ ; GFX12-NEXT: s_endpgm
1693+ bb:
1694+ %add1 = add nsw i32 %sidx , %vidx
1695+ %add2 = add nsw i32 %add1 , 65512
1696+ %gep = getelementptr inbounds [33554432 x i8 ], ptr addrspace (5 ) %sgpr_base , i32 0 , i32 %add2
1697+ store volatile i32 15 , ptr addrspace (5 ) %gep , align 4
1698+ ret void
1699+ }
1700+
; Loads an i32 from %scevgep (an inreg addrspace(5) pointer) at a constant
; byte offset of -24 and stores the loaded value through %out.
; In the expected output below, the GFX9 and GFX940 variants add 0xffffffe8
; (the 32-bit encoding of -24) to the base with s_add_u32, whereas the GFX10,
; GFX11 and GFX12 variants encode it as an immediate "offset:-24" on the
; scratch load.
1701+ define amdgpu_gs void @sgpr_base_negative_offset (ptr addrspace (1 ) %out , ptr addrspace (5 ) inreg %scevgep ) {
1702+ ; GFX9-LABEL: sgpr_base_negative_offset:
1703+ ; GFX9: ; %bb.0: ; %entry
1704+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1705+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1706+ ; GFX9-NEXT: s_add_u32 s0, s2, 0xffffffe8
1707+ ; GFX9-NEXT: scratch_load_dword v2, off, s0
1708+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1709+ ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1710+ ; GFX9-NEXT: s_endpgm
1711+ ;
1712+ ; GFX10-LABEL: sgpr_base_negative_offset:
1713+ ; GFX10: ; %bb.0: ; %entry
1714+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1715+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1716+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1717+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1718+ ; GFX10-NEXT: scratch_load_dword v2, off, s2 offset:-24
1719+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
1720+ ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1721+ ; GFX10-NEXT: s_endpgm
1722+ ;
1723+ ; GFX940-LABEL: sgpr_base_negative_offset:
1724+ ; GFX940: ; %bb.0: ; %entry
1725+ ; GFX940-NEXT: s_add_u32 s0, s0, 0xffffffe8
1726+ ; GFX940-NEXT: scratch_load_dword v2, off, s0
1727+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1728+ ; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1729+ ; GFX940-NEXT: s_endpgm
1730+ ;
1731+ ; GFX11-LABEL: sgpr_base_negative_offset:
1732+ ; GFX11: ; %bb.0: ; %entry
1733+ ; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1734+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1735+ ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1736+ ; GFX11-NEXT: s_nop 0
1737+ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1738+ ; GFX11-NEXT: s_endpgm
1739+ ;
1740+ ; GFX12-LABEL: sgpr_base_negative_offset:
1741+ ; GFX12: ; %bb.0: ; %entry
1742+ ; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1743+ ; GFX12-NEXT: s_wait_loadcnt 0x0
1744+ ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1745+ ; GFX12-NEXT: s_nop 0
1746+ ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1747+ ; GFX12-NEXT: s_endpgm
1748+ entry:
1749+ %scevgep28 = getelementptr i8 , ptr addrspace (5 ) %scevgep , i32 -24
1750+ %0 = load i32 , ptr addrspace (5 ) %scevgep28 , align 4
1751+ store i32 %0 , ptr addrspace (1 ) %out
1752+ ret void
1753+ }
1754+
15161755declare i32 @llvm.amdgcn.workitem.id.x ()
0 commit comments