@@ -1395,7 +1395,7 @@ define i64 @lshr_mad_i64_2(i64 %arg0) #0 {
13951395; CI-LABEL: lshr_mad_i64_2:
13961396; CI: ; %bb.0:
13971397; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1398- ; CI-NEXT: s_movk_i32 s4, 0xfc88
1398+ ; CI-NEXT: s_movk_i32 s4, 0xd1
13991399; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
14001400; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1
14011401; CI-NEXT: v_mov_b32_e32 v0, v2
@@ -1404,7 +1404,7 @@ define i64 @lshr_mad_i64_2(i64 %arg0) #0 {
14041404; SI-LABEL: lshr_mad_i64_2:
14051405; SI: ; %bb.0:
14061406; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1407- ; SI-NEXT: s_movk_i32 s4, 0xfc88
1407+ ; SI-NEXT: s_movk_i32 s4, 0xd1
14081408; SI-NEXT: v_mul_hi_u32 v2, v1, s4
14091409; SI-NEXT: v_mul_lo_u32 v3, v1, s4
14101410; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1
@@ -1415,7 +1415,7 @@ define i64 @lshr_mad_i64_2(i64 %arg0) #0 {
14151415; GFX9-LABEL: lshr_mad_i64_2:
14161416; GFX9: ; %bb.0:
14171417; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1418- ; GFX9-NEXT: s_movk_i32 s4, 0xfc88
1418+ ; GFX9-NEXT: s_movk_i32 s4, 0xd1
14191419; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
14201420; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1
14211421; GFX9-NEXT: v_mov_b32_e32 v0, v2
@@ -1424,7 +1424,7 @@ define i64 @lshr_mad_i64_2(i64 %arg0) #0 {
14241424; GFX11-LABEL: lshr_mad_i64_2:
14251425; GFX11: ; %bb.0:
14261426; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1427- ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc88 , v1, v[0:1]
1427+ ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xd1 , v1, v[0:1]
14281428; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
14291429; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1
14301430; GFX11-NEXT: v_mov_b32_e32 v0, v2
@@ -1437,6 +1437,64 @@ define i64 @lshr_mad_i64_2(i64 %arg0) #0 {
14371437; GFX12-NEXT: s_wait_samplecnt 0x0
14381438; GFX12-NEXT: s_wait_bvhcnt 0x0
14391439; GFX12-NEXT: s_wait_kmcnt 0x0
1440+ ; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
1441+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1442+ ; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1
1443+ ; GFX12-NEXT: v_mov_b32_e32 v0, v2
1444+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
1445+ %lsh = lshr i64 %arg0 , 32
1446+ %mul = mul i64 %lsh , s0xffffffff000000d1
1447+ %mad = add i64 %mul , %arg0
1448+
1449+ ret i64 %mad
1450+ }
1451+
1452+ define i64 @lshr_mad_i64_3 (i64 %arg0 ) #0 {
1453+ ; CI-LABEL: lshr_mad_i64_3:
1454+ ; CI: ; %bb.0:
1455+ ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1456+ ; CI-NEXT: s_movk_i32 s4, 0xfc88
1457+ ; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
1458+ ; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1
1459+ ; CI-NEXT: v_mov_b32_e32 v0, v2
1460+ ; CI-NEXT: s_setpc_b64 s[30:31]
1461+ ;
1462+ ; SI-LABEL: lshr_mad_i64_3:
1463+ ; SI: ; %bb.0:
1464+ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1465+ ; SI-NEXT: s_movk_i32 s4, 0xfc88
1466+ ; SI-NEXT: v_mul_hi_u32 v2, v1, s4
1467+ ; SI-NEXT: v_mul_lo_u32 v3, v1, s4
1468+ ; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1
1469+ ; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0
1470+ ; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
1471+ ; SI-NEXT: s_setpc_b64 s[30:31]
1472+ ;
1473+ ; GFX9-LABEL: lshr_mad_i64_3:
1474+ ; GFX9: ; %bb.0:
1475+ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1476+ ; GFX9-NEXT: s_movk_i32 s4, 0xfc88
1477+ ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
1478+ ; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1
1479+ ; GFX9-NEXT: v_mov_b32_e32 v0, v2
1480+ ; GFX9-NEXT: s_setpc_b64 s[30:31]
1481+ ;
1482+ ; GFX11-LABEL: lshr_mad_i64_3:
1483+ ; GFX11: ; %bb.0:
1484+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1485+ ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1]
1486+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1487+ ; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1
1488+ ; GFX11-NEXT: v_mov_b32_e32 v0, v2
1489+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
1490+ ;
1491+ ; GFX12-LABEL: lshr_mad_i64_3:
1492+ ; GFX12: ; %bb.0:
1493+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1494+ ; GFX12-NEXT: s_wait_expcnt 0x0
1495+ ; GFX12-NEXT: s_wait_samplecnt 0x0
1496+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
1497+ ; GFX12-NEXT: s_wait_kmcnt 0x0
14401498; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1]
14411499; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
14421500; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1
@@ -1449,8 +1507,8 @@ define i64 @lshr_mad_i64_2(i64 %arg0) #0 {
14491507 ret i64 %mad
14501508}
14511509
1452- define i64 @lshr_mad_i64_3 (i32 %arg0 , i64 %arg1 ) #0 {
1453- ; CI-LABEL: lshr_mad_i64_3 :
1510+ define i64 @lshr_mad_i64_4 (i32 %arg0 , i64 %arg1 ) #0 {
1511+ ; CI-LABEL: lshr_mad_i64_4 :
14541512; CI: ; %bb.0:
14551513; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14561514; CI-NEXT: v_mul_lo_u32 v3, v2, v0
@@ -1461,7 +1519,7 @@ define i64 @lshr_mad_i64_3(i32 %arg0, i64 %arg1) #0 {
14611519; CI-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
14621520; CI-NEXT: s_setpc_b64 s[30:31]
14631521;
1464- ; SI-LABEL: lshr_mad_i64_3 :
1522+ ; SI-LABEL: lshr_mad_i64_4 :
14651523; SI: ; %bb.0:
14661524; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14671525; SI-NEXT: v_mul_lo_u32 v2, v2, v0
@@ -1476,7 +1534,7 @@ define i64 @lshr_mad_i64_3(i32 %arg0, i64 %arg1) #0 {
14761534; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
14771535; SI-NEXT: s_setpc_b64 s[30:31]
14781536;
1479- ; GFX9-LABEL: lshr_mad_i64_3 :
1537+ ; GFX9-LABEL: lshr_mad_i64_4 :
14801538; GFX9: ; %bb.0:
14811539; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14821540; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v0, 0
@@ -1488,7 +1546,7 @@ define i64 @lshr_mad_i64_3(i32 %arg0, i64 %arg1) #0 {
14881546; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2
14891547; GFX9-NEXT: s_setpc_b64 s[30:31]
14901548;
1491- ; GFX11-LABEL: lshr_mad_i64_3 :
1549+ ; GFX11-LABEL: lshr_mad_i64_4 :
14921550; GFX11: ; %bb.0:
14931551; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14941552; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0
@@ -1502,7 +1560,7 @@ define i64 @lshr_mad_i64_3(i32 %arg0, i64 %arg1) #0 {
15021560; GFX11-NEXT: v_sub_nc_u32_e32 v1, v1, v5
15031561; GFX11-NEXT: s_setpc_b64 s[30:31]
15041562;
1505- ; GFX12-LABEL: lshr_mad_i64_3 :
1563+ ; GFX12-LABEL: lshr_mad_i64_4 :
15061564; GFX12: ; %bb.0:
15071565; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
15081566; GFX12-NEXT: s_wait_expcnt 0x0
@@ -1800,72 +1858,57 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 {
18001858 ret i64 %mad
18011859}
18021860
1803- define i64 @lshr_mad_i64_sgpr (i64 inreg %arg0 ) #0 {
1861+ define amdgpu_ps i64 @lshr_mad_i64_sgpr (i64 inreg %arg0 ) #0 {
18041862; CI-LABEL: lshr_mad_i64_sgpr:
18051863; CI: ; %bb.0:
1806- ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1807- ; CI-NEXT: v_mov_b32_e32 v0, s16
1864+ ; CI-NEXT: v_mov_b32_e32 v0, s0
18081865; CI-NEXT: v_mov_b32_e32 v2, 0xffff1c18
1809- ; CI-NEXT: v_mov_b32_e32 v1, s17
1810- ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s17, v2, v[0:1]
1811- ; CI-NEXT: v_subrev_i32_e32 v1, vcc, s17, v1
1812- ; CI-NEXT: s_setpc_b64 s[30:31]
1866+ ; CI-NEXT: v_mov_b32_e32 v1, s1
1867+ ; CI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s1, v2, v[0:1]
1868+ ; CI-NEXT: v_subrev_i32_e32 v1, vcc, s1, v1
1869+ ; CI-NEXT: v_readfirstlane_b32 s0, v0
1870+ ; CI-NEXT: v_readfirstlane_b32 s1, v1
1871+ ; CI-NEXT: ; return to shader part epilog
18131872;
18141873; SI-LABEL: lshr_mad_i64_sgpr:
18151874; SI: ; %bb.0:
1816- ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18171875; SI-NEXT: v_mov_b32_e32 v0, 0xffff1c18
1818- ; SI-NEXT: v_mul_hi_u32 v0, s17, v0
1819- ; SI-NEXT: s_mul_i32 s4, s17, 0xffff1c18
1820- ; SI-NEXT: v_mov_b32_e32 v2, s17
1821- ; SI-NEXT: v_subrev_i32_e32 v1, vcc, s17, v0
1822- ; SI-NEXT: v_mov_b32_e32 v0, s4
1823- ; SI-NEXT: v_add_i32_e32 v0, vcc, s16, v0
1824- ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
1825- ; SI-NEXT: s_setpc_b64 s[30:31]
1876+ ; SI-NEXT: v_mul_hi_u32 v0, s1, v0
1877+ ; SI-NEXT: s_mul_i32 s2, s1, 0xffff1c18
1878+ ; SI-NEXT: v_readfirstlane_b32 s3, v0
1879+ ; SI-NEXT: s_sub_i32 s3, s3, s1
1880+ ; SI-NEXT: s_add_u32 s0, s2, s0
1881+ ; SI-NEXT: s_addc_u32 s1, s3, s1
1882+ ; SI-NEXT: ; return to shader part epilog
18261883;
18271884; GFX9-LABEL: lshr_mad_i64_sgpr:
18281885; GFX9: ; %bb.0:
1829- ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1830- ; GFX9-NEXT: s_mul_hi_u32 s4, s17, 0xffff1c18
1831- ; GFX9-NEXT: s_sub_i32 s4, s4, s17
1832- ; GFX9-NEXT: s_mul_i32 s5, s17, 0xffff1c18
1833- ; GFX9-NEXT: s_add_u32 s5, s5, s16
1834- ; GFX9-NEXT: s_addc_u32 s4, s4, s17
1835- ; GFX9-NEXT: v_mov_b32_e32 v0, s5
1836- ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1837- ; GFX9-NEXT: s_setpc_b64 s[30:31]
1886+ ; GFX9-NEXT: s_mul_hi_u32 s2, s1, 0xffff1c18
1887+ ; GFX9-NEXT: s_sub_i32 s2, s2, s1
1888+ ; GFX9-NEXT: s_mul_i32 s3, s1, 0xffff1c18
1889+ ; GFX9-NEXT: s_add_u32 s0, s3, s0
1890+ ; GFX9-NEXT: s_addc_u32 s1, s2, s1
1891+ ; GFX9-NEXT: ; return to shader part epilog
18381892;
18391893; GFX11-LABEL: lshr_mad_i64_sgpr:
18401894; GFX11: ; %bb.0:
1841- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18421895; GFX11-NEXT: s_mul_hi_u32 s2, s1, 0xffff1c18
18431896; GFX11-NEXT: s_mul_i32 s3, s1, 0xffff1c18
18441897; GFX11-NEXT: s_sub_i32 s2, s2, s1
18451898; GFX11-NEXT: s_add_u32 s0, s3, s0
18461899; GFX11-NEXT: s_addc_u32 s1, s2, s1
1847- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1848- ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1849- ; GFX11-NEXT: s_setpc_b64 s[30:31]
1900+ ; GFX11-NEXT: ; return to shader part epilog
18501901;
18511902; GFX12-LABEL: lshr_mad_i64_sgpr:
18521903; GFX12: ; %bb.0:
1853- ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1854- ; GFX12-NEXT: s_wait_expcnt 0x0
1855- ; GFX12-NEXT: s_wait_samplecnt 0x0
1856- ; GFX12-NEXT: s_wait_bvhcnt 0x0
1857- ; GFX12-NEXT: s_wait_kmcnt 0x0
18581904; GFX12-NEXT: s_mov_b32 s4, 0xffff1c18
18591905; GFX12-NEXT: s_mov_b32 s3, 0
18601906; GFX12-NEXT: s_mov_b32 s2, s1
18611907; GFX12-NEXT: s_mov_b32 s5, -1
1862- ; GFX12-NEXT: s_wait_alu 0xfffe
1908+ ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
18631909; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5]
1864- ; GFX12-NEXT: s_wait_alu 0xfffe
18651910; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
1866- ; GFX12-NEXT: s_wait_alu 0xfffe
1867- ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1868- ; GFX12-NEXT: s_setpc_b64 s[30:31]
1911+ ; GFX12-NEXT: ; return to shader part epilog
18691912 %lsh = lshr i64 %arg0 , 32
18701913 %mul = mul i64 %lsh , s0xffffffffffff1c18
18711914 %mad = add i64 %mul , %arg0
0 commit comments