@@ -1510,4 +1510,243 @@ bb:
15101510 ret void
15111511}
15121512
; Scratch load through an SGPR (inreg) base pointer plus a constant byte offset
; of 65512 (0xffe8); the loaded i32 is then stored to the global pointer %out.
; Per the checks below, the GFX12 run encodes the whole offset in the load's
; immediate field (offset:65512), while the GFX9, GFX10, GFX940 and GFX11 runs
; first add 0xffe8 to the base with s_add_u32.
1513+ define amdgpu_gs void @sgpr_base_large_offset (ptr addrspace (1 ) %out , ptr addrspace (5 ) inreg %sgpr_base ) {
1514+ ; GFX9-LABEL: sgpr_base_large_offset:
1515+ ; GFX9: ; %bb.0: ; %entry
1516+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1517+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1518+ ; GFX9-NEXT: s_add_u32 s0, s2, 0xffe8
1519+ ; GFX9-NEXT: scratch_load_dword v2, off, s0
1520+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1521+ ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1522+ ; GFX9-NEXT: s_endpgm
1523+ ;
1524+ ; GFX10-LABEL: sgpr_base_large_offset:
1525+ ; GFX10: ; %bb.0: ; %entry
1526+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1527+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1528+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1529+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1530+ ; GFX10-NEXT: s_add_u32 s0, s2, 0xffe8
1531+ ; GFX10-NEXT: scratch_load_dword v2, off, s0
1532+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
1533+ ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1534+ ; GFX10-NEXT: s_endpgm
1535+ ;
1536+ ; GFX940-LABEL: sgpr_base_large_offset:
1537+ ; GFX940: ; %bb.0: ; %entry
1538+ ; GFX940-NEXT: s_add_u32 s0, s0, 0xffe8
1539+ ; GFX940-NEXT: scratch_load_dword v2, off, s0
1540+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1541+ ; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1542+ ; GFX940-NEXT: s_endpgm
1543+ ;
1544+ ; GFX11-LABEL: sgpr_base_large_offset:
1545+ ; GFX11: ; %bb.0: ; %entry
1546+ ; GFX11-NEXT: s_add_u32 s0, s0, 0xffe8
1547+ ; GFX11-NEXT: scratch_load_b32 v2, off, s0
1548+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1549+ ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1550+ ; GFX11-NEXT: s_nop 0
1551+ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1552+ ; GFX11-NEXT: s_endpgm
1553+ ;
1554+ ; GFX12-LABEL: sgpr_base_large_offset:
1555+ ; GFX12: ; %bb.0: ; %entry
1556+ ; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512
1557+ ; GFX12-NEXT: s_wait_loadcnt 0x0
1558+ ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1559+ ; GFX12-NEXT: s_nop 0
1560+ ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1561+ ; GFX12-NEXT: s_endpgm
1562+ entry:
1563+ %large_offset = getelementptr i8 , ptr addrspace (5 ) %sgpr_base , i32 65512
1564+ %load = load i32 , ptr addrspace (5 ) %large_offset , align 4
1565+ store i32 %load , ptr addrspace (1 ) %out
1566+ ret void
1567+ }
1568+
; Volatile scratch load at a constant byte offset of 16842728 (0x100ffe8) from
; %sgpr_base after the two low bits of the base have been cleared (and with
; 4294967292, i.e. 0xfffffffc); the loaded i32 is stored to the global pointer
; %out. Per the checks below, every run emits s_and_b32 with -4 followed by a
; scalar add of 0x100ffe8, and none of them places any part of the offset in the
; load's immediate field. The commented-out alloca in the body is not part of
; the test input.
1569+ define amdgpu_gs void @sgpr_base_large_offset_split (ptr addrspace (1 ) %out , ptr addrspace (5 ) inreg %sgpr_base ) {
1570+ ; GFX9-LABEL: sgpr_base_large_offset_split:
1571+ ; GFX9: ; %bb.0: ; %entry
1572+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1573+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1574+ ; GFX9-NEXT: s_and_b32 s0, s2, -4
1575+ ; GFX9-NEXT: s_add_u32 s0, s0, 0x100ffe8
1576+ ; GFX9-NEXT: scratch_load_dword v2, off, s0 glc
1577+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1578+ ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1579+ ; GFX9-NEXT: s_endpgm
1580+ ;
1581+ ; GFX10-LABEL: sgpr_base_large_offset_split:
1582+ ; GFX10: ; %bb.0: ; %entry
1583+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1584+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1585+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1586+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1587+ ; GFX10-NEXT: s_and_b32 s0, s2, -4
1588+ ; GFX10-NEXT: s_add_u32 s0, s0, 0x100ffe8
1589+ ; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
1590+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
1591+ ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1592+ ; GFX10-NEXT: s_endpgm
1593+ ;
1594+ ; GFX940-LABEL: sgpr_base_large_offset_split:
1595+ ; GFX940: ; %bb.0: ; %entry
1596+ ; GFX940-NEXT: s_and_b32 s0, s0, -4
1597+ ; GFX940-NEXT: s_add_u32 s0, s0, 0x100ffe8
1598+ ; GFX940-NEXT: scratch_load_dword v2, off, s0 sc0 sc1
1599+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1600+ ; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1601+ ; GFX940-NEXT: s_endpgm
1602+ ;
1603+ ; GFX11-LABEL: sgpr_base_large_offset_split:
1604+ ; GFX11: ; %bb.0: ; %entry
1605+ ; GFX11-NEXT: s_and_b32 s0, s0, -4
1606+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1607+ ; GFX11-NEXT: s_add_u32 s0, s0, 0x100ffe8
1608+ ; GFX11-NEXT: scratch_load_b32 v2, off, s0 glc dlc
1609+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1610+ ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1611+ ; GFX11-NEXT: s_nop 0
1612+ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1613+ ; GFX11-NEXT: s_endpgm
1614+ ;
1615+ ; GFX12-LABEL: sgpr_base_large_offset_split:
1616+ ; GFX12: ; %bb.0: ; %entry
1617+ ; GFX12-NEXT: s_and_b32 s0, s0, -4
1618+ ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1619+ ; GFX12-NEXT: s_add_co_u32 s0, s0, 0x100ffe8
1620+ ; GFX12-NEXT: scratch_load_b32 v2, off, s0 scope:SCOPE_SYS
1621+ ; GFX12-NEXT: s_wait_loadcnt 0x0
1622+ ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1623+ ; GFX12-NEXT: s_nop 0
1624+ ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1625+ ; GFX12-NEXT: s_endpgm
1626+ entry:
1627+ ;%alignedBase = alloca [33554432 x i8], align 4, addrspace(5)
1628+ %sgpr_base_i32 = ptrtoint ptr addrspace (5 ) %sgpr_base to i32
1629+ %sgpr_base_i32_align4 = and i32 %sgpr_base_i32 , 4294967292
1630+ %sgpr_base_align4 = inttoptr i32 %sgpr_base_i32_align4 to ptr addrspace (5 )
1631+ %split_offset = getelementptr inbounds [33554432 x i8 ], ptr addrspace (5 ) %sgpr_base_align4 , i32 0 , i32 16842728
1632+ %load = load volatile i32 , ptr addrspace (5 ) %split_offset , align 4
1633+ store i32 %load , ptr addrspace (1 ) %out
1634+ ret void
1635+ }
1636+
; Volatile scratch store of the constant 15 to %sgpr_base indexed by
; (%sidx + %vidx + 65512), where %sgpr_base and %sidx are inreg arguments and
; %vidx is a regular (non-inreg) argument.
; Per the checks below, the GFX12 run adds the two register indices to the base
; and encodes 65512 in the store's immediate field (offset:65512), while the
; GFX9, GFX10, GFX940 and GFX11 runs fold 0xffe8 into the address computation
; with v_add3_u32.
1637+ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset (ptr addrspace (5 ) inreg %sgpr_base , i32 inreg %sidx , i32 %vidx ) {
1638+ ; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1639+ ; GFX9: ; %bb.0: ; %bb
1640+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1641+ ; GFX9-NEXT: v_add_u32_e32 v0, s3, v0
1642+ ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffe8
1643+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1644+ ; GFX9-NEXT: v_add3_u32 v0, s2, v0, v1
1645+ ; GFX9-NEXT: v_mov_b32_e32 v1, 15
1646+ ; GFX9-NEXT: scratch_store_dword v0, v1, off
1647+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1648+ ; GFX9-NEXT: s_endpgm
1649+ ;
1650+ ; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1651+ ; GFX10: ; %bb.0: ; %bb
1652+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1653+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1654+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1655+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1656+ ; GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0
1657+ ; GFX10-NEXT: v_mov_b32_e32 v1, 15
1658+ ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 0xffe8
1659+ ; GFX10-NEXT: scratch_store_dword v0, v1, off
1660+ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1661+ ; GFX10-NEXT: s_endpgm
1662+ ;
1663+ ; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1664+ ; GFX940: ; %bb.0: ; %bb
1665+ ; GFX940-NEXT: v_add_u32_e32 v0, s1, v0
1666+ ; GFX940-NEXT: v_mov_b32_e32 v1, 0xffe8
1667+ ; GFX940-NEXT: v_add3_u32 v0, s0, v0, v1
1668+ ; GFX940-NEXT: v_mov_b32_e32 v1, 15
1669+ ; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
1670+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1671+ ; GFX940-NEXT: s_endpgm
1672+ ;
1673+ ; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1674+ ; GFX11: ; %bb.0: ; %bb
1675+ ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1676+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1677+ ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 0xffe8
1678+ ; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc
1679+ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1680+ ; GFX11-NEXT: s_endpgm
1681+ ;
1682+ ; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1683+ ; GFX12: ; %bb.0: ; %bb
1684+ ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1685+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1686+ ; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
1687+ ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
1688+ ; GFX12-NEXT: s_wait_storecnt 0x0
1689+ ; GFX12-NEXT: s_endpgm
1690+ bb:
1691+ %add1 = add nsw i32 %sidx , %vidx
1692+ %add2 = add nsw i32 %add1 , 65512
1693+ %gep = getelementptr inbounds [33554432 x i8 ], ptr addrspace (5 ) %sgpr_base , i32 0 , i32 %add2
1694+ store volatile i32 15 , ptr addrspace (5 ) %gep , align 4
1695+ ret void
1696+ }
1697+
; Scratch load through an SGPR (inreg) base pointer plus a constant byte offset
; of -24; the loaded i32 is then stored to the global pointer %out.
; Per the checks below, the GFX10, GFX11 and GFX12 runs encode the negative
; offset in the load's immediate field (offset:-24), while the GFX9 and GFX940
; runs add 0xffffffe8 to the base with s_add_u32 first.
1698+ define amdgpu_gs void @sgpr_base_negative_offset (ptr addrspace (1 ) %out , ptr addrspace (5 ) inreg %scevgep ) {
1699+ ; GFX9-LABEL: sgpr_base_negative_offset:
1700+ ; GFX9: ; %bb.0: ; %entry
1701+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1702+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1703+ ; GFX9-NEXT: s_add_u32 s0, s2, 0xffffffe8
1704+ ; GFX9-NEXT: scratch_load_dword v2, off, s0
1705+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1706+ ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1707+ ; GFX9-NEXT: s_endpgm
1708+ ;
1709+ ; GFX10-LABEL: sgpr_base_negative_offset:
1710+ ; GFX10: ; %bb.0: ; %entry
1711+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1712+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1713+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1714+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1715+ ; GFX10-NEXT: scratch_load_dword v2, off, s2 offset:-24
1716+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
1717+ ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1718+ ; GFX10-NEXT: s_endpgm
1719+ ;
1720+ ; GFX940-LABEL: sgpr_base_negative_offset:
1721+ ; GFX940: ; %bb.0: ; %entry
1722+ ; GFX940-NEXT: s_add_u32 s0, s0, 0xffffffe8
1723+ ; GFX940-NEXT: scratch_load_dword v2, off, s0
1724+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1725+ ; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1726+ ; GFX940-NEXT: s_endpgm
1727+ ;
1728+ ; GFX11-LABEL: sgpr_base_negative_offset:
1729+ ; GFX11: ; %bb.0: ; %entry
1730+ ; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1731+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1732+ ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1733+ ; GFX11-NEXT: s_nop 0
1734+ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1735+ ; GFX11-NEXT: s_endpgm
1736+ ;
1737+ ; GFX12-LABEL: sgpr_base_negative_offset:
1738+ ; GFX12: ; %bb.0: ; %entry
1739+ ; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1740+ ; GFX12-NEXT: s_wait_loadcnt 0x0
1741+ ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1742+ ; GFX12-NEXT: s_nop 0
1743+ ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1744+ ; GFX12-NEXT: s_endpgm
1745+ entry:
1746+ %scevgep28 = getelementptr i8 , ptr addrspace (5 ) %scevgep , i32 -24
1747+ %0 = load i32 , ptr addrspace (5 ) %scevgep28 , align 4
1748+ store i32 %0 , ptr addrspace (1 ) %out
1749+ ret void
1750+ }
1751+
15131752declare i32 @llvm.amdgcn.workitem.id.x ()
0 commit comments