@@ -5294,15 +5294,15 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
52945294; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
52955295; GFX11-NEXT: s_and_b32 s0, 1, s10
52965296; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0
5297- ; GFX11-NEXT: v_cndmask_b32_e64 v2 , 0, 1, s2
5297+ ; GFX11-NEXT: v_cmp_ne_u32_e64 s0 , 0, s0
52985298; GFX11-NEXT: s_cselect_b32 s1, 1, 0
5299- ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo , 0, s0
5299+ ; GFX11-NEXT: v_cndmask_b32_e64 v2 , 0, 1, s2
53005300; GFX11-NEXT: s_and_b32 s1, 1, s1
5301- ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
5302- ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5303- ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
5304- ; GFX11-NEXT: v_mov_b32_e32 v2, s5
5301+ ; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1
5302+ ; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0
53055303; GFX11-NEXT: s_ashr_i32 s0, s9, 31
5304+ ; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s1
5305+ ; GFX11-NEXT: v_mov_b32_e32 v2, s5
53065306; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000
53075307; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
53085308; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
@@ -5447,20 +5447,20 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
54475447; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
54485448; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo
54495449; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo
5450- ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
5451- ; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
5452- ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5]
5453- ; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
5454- ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
5455- ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5450+ ; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], v[0:1]
5451+ ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[2:3]
54565452; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
5457- ; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo
5458- ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
5453+ ; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
5454+ ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], v[4:5]
5455+ ; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
5456+ ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[2:3]
5457+ ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
5458+ ; GFX11-NEXT: v_cndmask_b16 v3, v7, v6, vcc_lo
5459+ ; GFX11-NEXT: v_cndmask_b16 v2, v2, 0, s0
5460+ ; GFX11-NEXT: v_xor_b32_e32 v2, v2, v3
54595461; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5
5460- ; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo
5461- ; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6
5462- ; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3
54635462; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
5463+ ; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3
54645464; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
54655465; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
54665466; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -5606,21 +5606,22 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
56065606; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
56075607; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
56085608; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5609- ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo , v[4:5], v[0:1]
5609+ ; GFX11-NEXT: v_cmp_lt_u64_e64 s0 , v[4:5], v[0:1]
56105610; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0
5611+ ; GFX11-NEXT: s_cselect_b32 s1, 1, 0
5612+ ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5613+ ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
5614+ ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[6:7], v[2:3]
5615+ ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
5616+ ; GFX11-NEXT: s_and_b32 s0, 1, s1
56115617; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
5612- ; GFX11-NEXT: s_cselect_b32 s0, 1, 0
5613- ; GFX11-NEXT: s_and_b32 s0, 1, s0
5614- ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5615- ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
5616- ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1
56175618; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
5618- ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5619- ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5619+ ; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo
5620+ ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
5621+ ; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s0
56205622; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5621- ; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
5622- ; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0
56235623; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
5624+ ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
56245625; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
56255626; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
56265627; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
@@ -5846,33 +5847,33 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
58465847; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo
58475848; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo
58485849; GFX11-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo
5849- ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1]
5850- ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5851- ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
5852- ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5850+ ; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[8:9], v[0:1]
5851+ ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[10:11]
58535852; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
5854- ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5855- ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
5856- ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5857- ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v4, v12
5858- ; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo
5859- ; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo
5860- ; GFX11-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo
5861- ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
5862- ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
5863- ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5]
5853+ ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
5854+ ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[16:17], v[2:3]
5855+ ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
5856+ ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
5857+ ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[10:11]
5858+ ; GFX11-NEXT: v_add_co_u32 v10, s1, v4, v12
5859+ ; GFX11-NEXT: v_add_co_ci_u32_e64 v11, s1, v5, v13, s1
5860+ ; GFX11-NEXT: v_add_co_ci_u32_e64 v12, s1, v6, v14, s1
5861+ ; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo
5862+ ; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s0
5863+ ; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[10:11], v[4:5]
5864+ ; GFX11-NEXT: v_add_co_ci_u32_e64 v13, s1, v7, v15, s1
5865+ ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[14:15]
58645866; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
5865- ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5866- ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7]
5867- ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
5868- ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15]
5869- ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
5870- ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7]
5871- ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19
5872- ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
5873- ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
5867+ ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
5868+ ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[12:13], v[6:7]
5869+ ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[12:13], v[6:7]
5870+ ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1
5871+ ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v13
5872+ ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
5873+ ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[14:15]
58745874; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6
5875- ; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
5875+ ; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, vcc_lo
5876+ ; GFX11-NEXT: v_cndmask_b16 v2, v4, 0, s0
58765877; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
58775878; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v17
58785879; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
@@ -5882,10 +5883,10 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
58825883; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
58835884; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo
58845885; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4
5885- ; GFX11-NEXT: v_cndmask_b32_e64 v4, v12 , v6, s0
5886- ; GFX11-NEXT: v_cndmask_b32_e64 v5, v13 , v6, s0
5887- ; GFX11-NEXT: v_cndmask_b32_e64 v6, v18 , v6, s0
5888- ; GFX11-NEXT: v_cndmask_b32_e64 v7, v19 , v7, s0
5886+ ; GFX11-NEXT: v_cndmask_b32_e64 v4, v10 , v6, s0
5887+ ; GFX11-NEXT: v_cndmask_b32_e64 v5, v11 , v6, s0
5888+ ; GFX11-NEXT: v_cndmask_b32_e64 v6, v12 , v6, s0
5889+ ; GFX11-NEXT: v_cndmask_b32_e64 v7, v13 , v7, s0
58895890; GFX11-NEXT: s_setpc_b64 s[30:31]
58905891 %result = call <2 x i128 > @llvm.sadd.sat.v2i128 (<2 x i128 > %lhs , <2 x i128 > %rhs )
58915892 ret <2 x i128 > %result
@@ -6243,16 +6244,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
62436244; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
62446245; GFX11-NEXT: s_and_b32 s0, 1, s18
62456246; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
6246- ; GFX11-NEXT: v_cndmask_b32_e64 v2 , 0, 1, s2
6247+ ; GFX11-NEXT: v_cmp_ne_u32_e64 s0 , 0, s0
62476248; GFX11-NEXT: s_cselect_b32 s1, 1, 0
6248- ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo , 0, s0
6249+ ; GFX11-NEXT: v_cndmask_b32_e64 v2 , 0, 1, s2
62496250; GFX11-NEXT: s_and_b32 s1, 1, s1
62506251; GFX11-NEXT: s_ashr_i32 s10, s17, 31
6251- ; GFX11-NEXT: v_cmp_ne_u32_e64 s0 , 0, s1
6252+ ; GFX11-NEXT: v_cmp_ne_u32_e64 s1 , 0, s1
62526253; GFX11-NEXT: s_add_i32 s11, s10, 0x80000000
6253- ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
6254- ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
6254+ ; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0
62556255; GFX11-NEXT: s_add_u32 s0, s4, s12
6256+ ; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s1
62566257; GFX11-NEXT: s_addc_u32 s1, s5, s13
62576258; GFX11-NEXT: s_addc_u32 s2, s6, s14
62586259; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
@@ -6268,17 +6269,18 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
62686269; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
62696270; GFX11-NEXT: s_and_b32 s4, 1, s12
62706271; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0
6271- ; GFX11-NEXT: v_cndmask_b32_e64 v3 , 0, 1, s6
6272+ ; GFX11-NEXT: v_cmp_ne_u32_e64 s4 , 0, s4
62726273; GFX11-NEXT: s_cselect_b32 s5, 1, 0
6273- ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo , 0, s4
6274+ ; GFX11-NEXT: v_cndmask_b32_e64 v3 , 0, 1, s6
62746275; GFX11-NEXT: s_and_b32 s5, 1, s5
6275- ; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
6276- ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
6277- ; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, 0, s4
6276+ ; GFX11-NEXT: v_cmp_ne_u32_e64 s5, 0, s5
6277+ ; GFX11-NEXT: v_cndmask_b16 v1, v2, v1, s4
6278+ ; GFX11-NEXT: s_ashr_i32 s4, s3, 31
6279+ ; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
6280+ ; GFX11-NEXT: v_cndmask_b16 v2, v3, 0, s5
62786281; GFX11-NEXT: v_mov_b32_e32 v3, s8
62796282; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
62806283; GFX11-NEXT: v_mov_b32_e32 v0, s16
6281- ; GFX11-NEXT: s_ashr_i32 s4, s3, 31
62826284; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
62836285; GFX11-NEXT: v_mov_b32_e32 v4, s9
62846286; GFX11-NEXT: v_mov_b32_e32 v2, s17
@@ -6287,7 +6289,6 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
62876289; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
62886290; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
62896291; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
6290- ; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
62916292; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
62926293; GFX11-NEXT: v_mov_b32_e32 v1, s2
62936294; GFX11-NEXT: v_readfirstlane_b32 s1, v4
0 commit comments