@@ -1582,28 +1582,22 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
15821582; SI-NEXT: s_nop 1
15831583; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
15841584; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1585- ; SI-NEXT: v_readfirstlane_b32 s2, v5
1586- ; SI-NEXT: s_bfe_u32 s0, s2, 0xb0014
1587- ; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01
1588- ; SI-NEXT: s_mov_b32 s1, 0xfffff
1589- ; SI-NEXT: s_mov_b32 s0, s6
1590- ; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
1591- ; SI-NEXT: v_not_b32_e32 v6, s0
1592- ; SI-NEXT: v_and_b32_e32 v6, v4, v6
1593- ; SI-NEXT: v_not_b32_e32 v7, s1
1594- ; SI-NEXT: v_and_b32_e32 v5, v5, v7
1595- ; SI-NEXT: s_and_b32 s0, s2, 0x80000000
1596- ; SI-NEXT: s_cmp_lt_i32 s3, 0
1597- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1598- ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
1599- ; SI-NEXT: v_mov_b32_e32 v7, s0
1600- ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1601- ; SI-NEXT: s_cmp_gt_i32 s3, 51
1602- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1603- ; SI-NEXT: v_mov_b32_e32 v7, s2
1604- ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1605- ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
1606- ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1585+ ; SI-NEXT: v_readfirstlane_b32 s0, v4
1586+ ; SI-NEXT: v_readfirstlane_b32 s1, v5
1587+ ; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014
1588+ ; SI-NEXT: s_add_i32 s8, s2, 0xfffffc01
1589+ ; SI-NEXT: s_mov_b32 s3, 0xfffff
1590+ ; SI-NEXT: s_mov_b32 s2, s6
1591+ ; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8
1592+ ; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3]
1593+ ; SI-NEXT: s_and_b32 s9, s1, 0x80000000
1594+ ; SI-NEXT: s_cmp_lt_i32 s8, 0
1595+ ; SI-NEXT: s_cselect_b32 s2, 0, s2
1596+ ; SI-NEXT: s_cselect_b32 s3, s9, s3
1597+ ; SI-NEXT: s_cmp_gt_i32 s8, 51
1598+ ; SI-NEXT: s_cselect_b32 s1, s1, s3
1599+ ; SI-NEXT: s_cselect_b32 s0, s0, s2
1600+ ; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[2:3], v[0:1]
16071601; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
16081602; SI-NEXT: s_endpgm
16091603;
@@ -1859,28 +1853,22 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
18591853; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
18601854; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
18611855; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1862- ; SI-NEXT: v_readfirstlane_b32 s6, v5
1863- ; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
1864- ; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
1865- ; SI-NEXT: s_mov_b32 s5, 0xfffff
1866- ; SI-NEXT: s_mov_b32 s4, s2
1867- ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
1868- ; SI-NEXT: v_not_b32_e32 v6, s4
1869- ; SI-NEXT: v_and_b32_e32 v6, v4, v6
1870- ; SI-NEXT: v_not_b32_e32 v7, s5
1871- ; SI-NEXT: v_and_b32_e32 v5, v5, v7
1872- ; SI-NEXT: s_and_b32 s4, s6, 0x80000000
1873- ; SI-NEXT: s_cmp_lt_i32 s7, 0
1874- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1875- ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
1876- ; SI-NEXT: v_mov_b32_e32 v7, s4
1877- ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1878- ; SI-NEXT: s_cmp_gt_i32 s7, 51
1879- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1880- ; SI-NEXT: v_mov_b32_e32 v7, s6
1881- ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1882- ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
1883- ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1856+ ; SI-NEXT: v_readfirstlane_b32 s4, v4
1857+ ; SI-NEXT: v_readfirstlane_b32 s5, v5
1858+ ; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014
1859+ ; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01
1860+ ; SI-NEXT: s_mov_b32 s7, 0xfffff
1861+ ; SI-NEXT: s_mov_b32 s6, s2
1862+ ; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s8
1863+ ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7]
1864+ ; SI-NEXT: s_and_b32 s9, s5, 0x80000000
1865+ ; SI-NEXT: s_cmp_lt_i32 s8, 0
1866+ ; SI-NEXT: s_cselect_b32 s6, 0, s6
1867+ ; SI-NEXT: s_cselect_b32 s7, s9, s7
1868+ ; SI-NEXT: s_cmp_gt_i32 s8, 51
1869+ ; SI-NEXT: s_cselect_b32 s5, s5, s7
1870+ ; SI-NEXT: s_cselect_b32 s4, s4, s6
1871+ ; SI-NEXT: v_fma_f64 v[0:1], -s[4:5], v[2:3], v[0:1]
18841872; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
18851873; SI-NEXT: s_endpgm
18861874;
@@ -2109,28 +2097,22 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
21092097; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
21102098; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
21112099; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
2112- ; SI-NEXT: v_readfirstlane_b32 s6, v5
2113- ; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
2114- ; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
2115- ; SI-NEXT: s_mov_b32 s5, 0xfffff
2116- ; SI-NEXT: s_mov_b32 s4, s2
2117- ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
2118- ; SI-NEXT: v_not_b32_e32 v6, s4
2119- ; SI-NEXT: v_and_b32_e32 v6, v4, v6
2120- ; SI-NEXT: v_not_b32_e32 v7, s5
2121- ; SI-NEXT: v_and_b32_e32 v5, v5, v7
2122- ; SI-NEXT: s_and_b32 s4, s6, 0x80000000
2123- ; SI-NEXT: s_cmp_lt_i32 s7, 0
2124- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
2125- ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
2126- ; SI-NEXT: v_mov_b32_e32 v7, s4
2127- ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
2128- ; SI-NEXT: s_cmp_gt_i32 s7, 51
2129- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
2130- ; SI-NEXT: v_mov_b32_e32 v7, s6
2131- ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
2132- ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
2133- ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
2100+ ; SI-NEXT: v_readfirstlane_b32 s4, v4
2101+ ; SI-NEXT: v_readfirstlane_b32 s5, v5
2102+ ; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014
2103+ ; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01
2104+ ; SI-NEXT: s_mov_b32 s7, 0xfffff
2105+ ; SI-NEXT: s_mov_b32 s6, s2
2106+ ; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s8
2107+ ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7]
2108+ ; SI-NEXT: s_and_b32 s9, s5, 0x80000000
2109+ ; SI-NEXT: s_cmp_lt_i32 s8, 0
2110+ ; SI-NEXT: s_cselect_b32 s6, 0, s6
2111+ ; SI-NEXT: s_cselect_b32 s7, s9, s7
2112+ ; SI-NEXT: s_cmp_gt_i32 s8, 51
2113+ ; SI-NEXT: s_cselect_b32 s5, s5, s7
2114+ ; SI-NEXT: s_cselect_b32 s4, s4, s6
2115+ ; SI-NEXT: v_fma_f64 v[0:1], -s[4:5], v[2:3], v[0:1]
21342116; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
21352117; SI-NEXT: s_endpgm
21362118;
@@ -5251,27 +5233,22 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
52515233; SI-NEXT: s_nop 1
52525234; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
52535235; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
5254- ; SI-NEXT: v_readfirstlane_b32 s8, v9
5255- ; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
5256- ; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
5236+ ; SI-NEXT: v_readfirstlane_b32 s0, v8
5237+ ; SI-NEXT: v_readfirstlane_b32 s1, v9
5238+ ; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014
5239+ ; SI-NEXT: s_add_i32 s10, s2, 0xfffffc01
52575240; SI-NEXT: s_mov_b32 s3, 0xfffff
5258- ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
5259- ; SI-NEXT: v_not_b32_e32 v10, s0
5260- ; SI-NEXT: v_and_b32_e32 v10, v8, v10
5261- ; SI-NEXT: v_not_b32_e32 v11, s1
5262- ; SI-NEXT: v_and_b32_e32 v9, v9, v11
5263- ; SI-NEXT: s_and_b32 s0, s8, 0x80000000
5264- ; SI-NEXT: s_cmp_lt_i32 s9, 0
5265- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
5266- ; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
5267- ; SI-NEXT: v_mov_b32_e32 v11, s0
5268- ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
5269- ; SI-NEXT: s_cmp_gt_i32 s9, 51
5270- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
5271- ; SI-NEXT: v_mov_b32_e32 v11, s8
5272- ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
5273- ; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
5274- ; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
5241+ ; SI-NEXT: s_mov_b32 s2, s6
5242+ ; SI-NEXT: s_lshr_b64 s[8:9], s[2:3], s10
5243+ ; SI-NEXT: s_andn2_b64 s[8:9], s[0:1], s[8:9]
5244+ ; SI-NEXT: s_and_b32 s11, s1, 0x80000000
5245+ ; SI-NEXT: s_cmp_lt_i32 s10, 0
5246+ ; SI-NEXT: s_cselect_b32 s8, 0, s8
5247+ ; SI-NEXT: s_cselect_b32 s9, s11, s9
5248+ ; SI-NEXT: s_cmp_gt_i32 s10, 51
5249+ ; SI-NEXT: s_cselect_b32 s1, s1, s9
5250+ ; SI-NEXT: s_cselect_b32 s0, s0, s8
5251+ ; SI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[6:7], v[2:3]
52755252; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
52765253; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
52775254; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
@@ -5287,26 +5264,20 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
52875264; SI-NEXT: s_nop 1
52885265; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
52895266; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
5290- ; SI-NEXT: v_readfirstlane_b32 s8, v7
5291- ; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
5292- ; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
5293- ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
5294- ; SI-NEXT: v_not_b32_e32 v8, s0
5295- ; SI-NEXT: v_and_b32_e32 v8, v6, v8
5296- ; SI-NEXT: v_not_b32_e32 v9, s1
5297- ; SI-NEXT: v_and_b32_e32 v7, v7, v9
5298- ; SI-NEXT: s_and_b32 s0, s8, 0x80000000
5299- ; SI-NEXT: s_cmp_lt_i32 s9, 0
5300- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
5301- ; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
5302- ; SI-NEXT: v_mov_b32_e32 v9, s0
5303- ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
5304- ; SI-NEXT: s_cmp_gt_i32 s9, 51
5305- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
5306- ; SI-NEXT: v_mov_b32_e32 v9, s8
5307- ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
5308- ; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
5309- ; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
5267+ ; SI-NEXT: v_readfirstlane_b32 s0, v6
5268+ ; SI-NEXT: v_readfirstlane_b32 s1, v7
5269+ ; SI-NEXT: s_bfe_u32 s8, s1, 0xb0014
5270+ ; SI-NEXT: s_addk_i32 s8, 0xfc01
5271+ ; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8
5272+ ; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3]
5273+ ; SI-NEXT: s_and_b32 s9, s1, 0x80000000
5274+ ; SI-NEXT: s_cmp_lt_i32 s8, 0
5275+ ; SI-NEXT: s_cselect_b32 s2, 0, s2
5276+ ; SI-NEXT: s_cselect_b32 s3, s9, s3
5277+ ; SI-NEXT: s_cmp_gt_i32 s8, 51
5278+ ; SI-NEXT: s_cselect_b32 s1, s1, s3
5279+ ; SI-NEXT: s_cselect_b32 s0, s0, s2
5280+ ; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[4:5], v[0:1]
53105281; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
53115282; SI-NEXT: s_endpgm
53125283;
0 commit comments