@@ -1582,28 +1582,22 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
1582
1582
; SI-NEXT: s_nop 1
1583
1583
; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
1584
1584
; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1585
- ; SI-NEXT: v_readfirstlane_b32 s2, v5
1586
- ; SI-NEXT: s_bfe_u32 s0, s2, 0xb0014
1587
- ; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01
1588
- ; SI-NEXT: s_mov_b32 s1, 0xfffff
1589
- ; SI-NEXT: s_mov_b32 s0, s6
1590
- ; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
1591
- ; SI-NEXT: v_not_b32_e32 v6, s0
1592
- ; SI-NEXT: v_and_b32_e32 v6, v4, v6
1593
- ; SI-NEXT: v_not_b32_e32 v7, s1
1594
- ; SI-NEXT: v_and_b32_e32 v5, v5, v7
1595
- ; SI-NEXT: s_and_b32 s0, s2, 0x80000000
1596
- ; SI-NEXT: s_cmp_lt_i32 s3, 0
1597
- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1598
- ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
1599
- ; SI-NEXT: v_mov_b32_e32 v7, s0
1600
- ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1601
- ; SI-NEXT: s_cmp_gt_i32 s3, 51
1602
- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1603
- ; SI-NEXT: v_mov_b32_e32 v7, s2
1604
- ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1605
- ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
1606
- ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1585
+ ; SI-NEXT: v_readfirstlane_b32 s0, v4
1586
+ ; SI-NEXT: v_readfirstlane_b32 s1, v5
1587
+ ; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014
1588
+ ; SI-NEXT: s_add_i32 s8, s2, 0xfffffc01
1589
+ ; SI-NEXT: s_mov_b32 s3, 0xfffff
1590
+ ; SI-NEXT: s_mov_b32 s2, s6
1591
+ ; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8
1592
+ ; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3]
1593
+ ; SI-NEXT: s_and_b32 s9, s1, 0x80000000
1594
+ ; SI-NEXT: s_cmp_lt_i32 s8, 0
1595
+ ; SI-NEXT: s_cselect_b32 s2, 0, s2
1596
+ ; SI-NEXT: s_cselect_b32 s3, s9, s3
1597
+ ; SI-NEXT: s_cmp_gt_i32 s8, 51
1598
+ ; SI-NEXT: s_cselect_b32 s1, s1, s3
1599
+ ; SI-NEXT: s_cselect_b32 s0, s0, s2
1600
+ ; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[2:3], v[0:1]
1607
1601
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1608
1602
; SI-NEXT: s_endpgm
1609
1603
;
@@ -1859,28 +1853,22 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
1859
1853
; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1860
1854
; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1861
1855
; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1862
- ; SI-NEXT: v_readfirstlane_b32 s6, v5
1863
- ; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
1864
- ; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
1865
- ; SI-NEXT: s_mov_b32 s5, 0xfffff
1866
- ; SI-NEXT: s_mov_b32 s4, s2
1867
- ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
1868
- ; SI-NEXT: v_not_b32_e32 v6, s4
1869
- ; SI-NEXT: v_and_b32_e32 v6, v4, v6
1870
- ; SI-NEXT: v_not_b32_e32 v7, s5
1871
- ; SI-NEXT: v_and_b32_e32 v5, v5, v7
1872
- ; SI-NEXT: s_and_b32 s4, s6, 0x80000000
1873
- ; SI-NEXT: s_cmp_lt_i32 s7, 0
1874
- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1875
- ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
1876
- ; SI-NEXT: v_mov_b32_e32 v7, s4
1877
- ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1878
- ; SI-NEXT: s_cmp_gt_i32 s7, 51
1879
- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1880
- ; SI-NEXT: v_mov_b32_e32 v7, s6
1881
- ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1882
- ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
1883
- ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1856
+ ; SI-NEXT: v_readfirstlane_b32 s4, v4
1857
+ ; SI-NEXT: v_readfirstlane_b32 s5, v5
1858
+ ; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014
1859
+ ; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01
1860
+ ; SI-NEXT: s_mov_b32 s7, 0xfffff
1861
+ ; SI-NEXT: s_mov_b32 s6, s2
1862
+ ; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s8
1863
+ ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7]
1864
+ ; SI-NEXT: s_and_b32 s9, s5, 0x80000000
1865
+ ; SI-NEXT: s_cmp_lt_i32 s8, 0
1866
+ ; SI-NEXT: s_cselect_b32 s6, 0, s6
1867
+ ; SI-NEXT: s_cselect_b32 s7, s9, s7
1868
+ ; SI-NEXT: s_cmp_gt_i32 s8, 51
1869
+ ; SI-NEXT: s_cselect_b32 s5, s5, s7
1870
+ ; SI-NEXT: s_cselect_b32 s4, s4, s6
1871
+ ; SI-NEXT: v_fma_f64 v[0:1], -s[4:5], v[2:3], v[0:1]
1884
1872
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1885
1873
; SI-NEXT: s_endpgm
1886
1874
;
@@ -2109,28 +2097,22 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
2109
2097
; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
2110
2098
; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
2111
2099
; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
2112
- ; SI-NEXT: v_readfirstlane_b32 s6, v5
2113
- ; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
2114
- ; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
2115
- ; SI-NEXT: s_mov_b32 s5, 0xfffff
2116
- ; SI-NEXT: s_mov_b32 s4, s2
2117
- ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
2118
- ; SI-NEXT: v_not_b32_e32 v6, s4
2119
- ; SI-NEXT: v_and_b32_e32 v6, v4, v6
2120
- ; SI-NEXT: v_not_b32_e32 v7, s5
2121
- ; SI-NEXT: v_and_b32_e32 v5, v5, v7
2122
- ; SI-NEXT: s_and_b32 s4, s6, 0x80000000
2123
- ; SI-NEXT: s_cmp_lt_i32 s7, 0
2124
- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
2125
- ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
2126
- ; SI-NEXT: v_mov_b32_e32 v7, s4
2127
- ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
2128
- ; SI-NEXT: s_cmp_gt_i32 s7, 51
2129
- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
2130
- ; SI-NEXT: v_mov_b32_e32 v7, s6
2131
- ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
2132
- ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
2133
- ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
2100
+ ; SI-NEXT: v_readfirstlane_b32 s4, v4
2101
+ ; SI-NEXT: v_readfirstlane_b32 s5, v5
2102
+ ; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014
2103
+ ; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01
2104
+ ; SI-NEXT: s_mov_b32 s7, 0xfffff
2105
+ ; SI-NEXT: s_mov_b32 s6, s2
2106
+ ; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s8
2107
+ ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7]
2108
+ ; SI-NEXT: s_and_b32 s9, s5, 0x80000000
2109
+ ; SI-NEXT: s_cmp_lt_i32 s8, 0
2110
+ ; SI-NEXT: s_cselect_b32 s6, 0, s6
2111
+ ; SI-NEXT: s_cselect_b32 s7, s9, s7
2112
+ ; SI-NEXT: s_cmp_gt_i32 s8, 51
2113
+ ; SI-NEXT: s_cselect_b32 s5, s5, s7
2114
+ ; SI-NEXT: s_cselect_b32 s4, s4, s6
2115
+ ; SI-NEXT: v_fma_f64 v[0:1], -s[4:5], v[2:3], v[0:1]
2134
2116
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2135
2117
; SI-NEXT: s_endpgm
2136
2118
;
@@ -5251,27 +5233,22 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
5251
5233
; SI-NEXT: s_nop 1
5252
5234
; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
5253
5235
; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
5254
- ; SI-NEXT: v_readfirstlane_b32 s8, v9
5255
- ; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
5256
- ; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
5236
+ ; SI-NEXT: v_readfirstlane_b32 s0, v8
5237
+ ; SI-NEXT: v_readfirstlane_b32 s1, v9
5238
+ ; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014
5239
+ ; SI-NEXT: s_add_i32 s10, s2, 0xfffffc01
5257
5240
; SI-NEXT: s_mov_b32 s3, 0xfffff
5258
- ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
5259
- ; SI-NEXT: v_not_b32_e32 v10, s0
5260
- ; SI-NEXT: v_and_b32_e32 v10, v8, v10
5261
- ; SI-NEXT: v_not_b32_e32 v11, s1
5262
- ; SI-NEXT: v_and_b32_e32 v9, v9, v11
5263
- ; SI-NEXT: s_and_b32 s0, s8, 0x80000000
5264
- ; SI-NEXT: s_cmp_lt_i32 s9, 0
5265
- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
5266
- ; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
5267
- ; SI-NEXT: v_mov_b32_e32 v11, s0
5268
- ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
5269
- ; SI-NEXT: s_cmp_gt_i32 s9, 51
5270
- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
5271
- ; SI-NEXT: v_mov_b32_e32 v11, s8
5272
- ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
5273
- ; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
5274
- ; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
5241
+ ; SI-NEXT: s_mov_b32 s2, s6
5242
+ ; SI-NEXT: s_lshr_b64 s[8:9], s[2:3], s10
5243
+ ; SI-NEXT: s_andn2_b64 s[8:9], s[0:1], s[8:9]
5244
+ ; SI-NEXT: s_and_b32 s11, s1, 0x80000000
5245
+ ; SI-NEXT: s_cmp_lt_i32 s10, 0
5246
+ ; SI-NEXT: s_cselect_b32 s8, 0, s8
5247
+ ; SI-NEXT: s_cselect_b32 s9, s11, s9
5248
+ ; SI-NEXT: s_cmp_gt_i32 s10, 51
5249
+ ; SI-NEXT: s_cselect_b32 s1, s1, s9
5250
+ ; SI-NEXT: s_cselect_b32 s0, s0, s8
5251
+ ; SI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[6:7], v[2:3]
5275
5252
; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
5276
5253
; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
5277
5254
; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
@@ -5287,26 +5264,20 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
5287
5264
; SI-NEXT: s_nop 1
5288
5265
; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
5289
5266
; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
5290
- ; SI-NEXT: v_readfirstlane_b32 s8, v7
5291
- ; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
5292
- ; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
5293
- ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
5294
- ; SI-NEXT: v_not_b32_e32 v8, s0
5295
- ; SI-NEXT: v_and_b32_e32 v8, v6, v8
5296
- ; SI-NEXT: v_not_b32_e32 v9, s1
5297
- ; SI-NEXT: v_and_b32_e32 v7, v7, v9
5298
- ; SI-NEXT: s_and_b32 s0, s8, 0x80000000
5299
- ; SI-NEXT: s_cmp_lt_i32 s9, 0
5300
- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
5301
- ; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
5302
- ; SI-NEXT: v_mov_b32_e32 v9, s0
5303
- ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
5304
- ; SI-NEXT: s_cmp_gt_i32 s9, 51
5305
- ; SI-NEXT: s_cselect_b64 vcc, -1, 0
5306
- ; SI-NEXT: v_mov_b32_e32 v9, s8
5307
- ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
5308
- ; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
5309
- ; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
5267
+ ; SI-NEXT: v_readfirstlane_b32 s0, v6
5268
+ ; SI-NEXT: v_readfirstlane_b32 s1, v7
5269
+ ; SI-NEXT: s_bfe_u32 s8, s1, 0xb0014
5270
+ ; SI-NEXT: s_addk_i32 s8, 0xfc01
5271
+ ; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8
5272
+ ; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3]
5273
+ ; SI-NEXT: s_and_b32 s9, s1, 0x80000000
5274
+ ; SI-NEXT: s_cmp_lt_i32 s8, 0
5275
+ ; SI-NEXT: s_cselect_b32 s2, 0, s2
5276
+ ; SI-NEXT: s_cselect_b32 s3, s9, s3
5277
+ ; SI-NEXT: s_cmp_gt_i32 s8, 51
5278
+ ; SI-NEXT: s_cselect_b32 s1, s1, s3
5279
+ ; SI-NEXT: s_cselect_b32 s0, s0, s2
5280
+ ; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[4:5], v[0:1]
5310
5281
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5311
5282
; SI-NEXT: s_endpgm
5312
5283
;
0 commit comments