Skip to content

Commit fd996b4

Browse files
committed
add true16 codegen and update test
1 parent 0b4967f commit fd996b4

27 files changed

+3026
-2966
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,6 +1245,10 @@ class VOPSelectPat <ValueType vt> : GCNPat <
12451245
(vt (select i1:$src0, vt:$src1, vt:$src2)),
12461246
(V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
12471247
>;
1248+
class VOPSelectPat_t16 <ValueType vt> : GCNPat <
1249+
(vt (select i1:$src0, vt:$src1, vt:$src2)),
1250+
(V_CNDMASK_B16_t16_e64 0, VSrcT_b16:$src2, 0, VSrcT_b16:$src1, SSrc_i1:$src0)
1251+
>;
12481252
class VOPSelectPat_fake16 <ValueType vt> : GCNPat <
12491253
(vt (select i1:$src0, vt:$src1, vt:$src2)),
12501254
(V_CNDMASK_B16_fake16_e64 0, VSrc_b16:$src2, 0, VSrc_b16:$src1, SSrc_i1:$src0)
@@ -1256,6 +1260,10 @@ let True16Predicate = NotHasTrue16BitInsts in {
12561260
def : VOPSelectPat <f16>;
12571261
def : VOPSelectPat <i16>;
12581262
} // End True16Predicate = NotHasTrue16BitInsts
1263+
let True16Predicate = UseRealTrue16Insts in {
1264+
def : VOPSelectPat_t16 <f16>;
1265+
def : VOPSelectPat_t16 <i16>;
1266+
} // End True16Predicate = UseRealTrue16Insts
12591267
let True16Predicate = UseFakeTrue16Insts in {
12601268
def : VOPSelectPat_fake16 <f16>;
12611269
def : VOPSelectPat_fake16 <i16>;

llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll

Lines changed: 68 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -5294,15 +5294,15 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
52945294
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
52955295
; GFX11-NEXT: s_and_b32 s0, 1, s10
52965296
; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0
5297-
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
5297+
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
52985298
; GFX11-NEXT: s_cselect_b32 s1, 1, 0
5299-
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
5299+
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
53005300
; GFX11-NEXT: s_and_b32 s1, 1, s1
5301-
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
5302-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5303-
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
5304-
; GFX11-NEXT: v_mov_b32_e32 v2, s5
5301+
; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1
5302+
; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0
53055303
; GFX11-NEXT: s_ashr_i32 s0, s9, 31
5304+
; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s1
5305+
; GFX11-NEXT: v_mov_b32_e32 v2, s5
53065306
; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000
53075307
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
53085308
; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
@@ -5447,20 +5447,20 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
54475447
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
54485448
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo
54495449
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo
5450-
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
5451-
; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
5452-
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5]
5453-
; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
5454-
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
5455-
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5450+
; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], v[0:1]
5451+
; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[2:3]
54565452
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
5457-
; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo
5458-
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
5453+
; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
5454+
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], v[4:5]
5455+
; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
5456+
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[2:3]
5457+
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
5458+
; GFX11-NEXT: v_cndmask_b16 v3, v7, v6, vcc_lo
5459+
; GFX11-NEXT: v_cndmask_b16 v2, v2, 0, s0
5460+
; GFX11-NEXT: v_xor_b32_e32 v2, v2, v3
54595461
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5
5460-
; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo
5461-
; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6
5462-
; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3
54635462
; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
5463+
; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3
54645464
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
54655465
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
54665466
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -5606,21 +5606,22 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
56065606
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
56075607
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
56085608
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5609-
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
5609+
; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[4:5], v[0:1]
56105610
; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0
5611+
; GFX11-NEXT: s_cselect_b32 s1, 1, 0
5612+
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5613+
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
5614+
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[6:7], v[2:3]
5615+
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
5616+
; GFX11-NEXT: s_and_b32 s0, 1, s1
56115617
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
5612-
; GFX11-NEXT: s_cselect_b32 s0, 1, 0
5613-
; GFX11-NEXT: s_and_b32 s0, 1, s0
5614-
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5615-
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
5616-
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1
56175618
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
5618-
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5619-
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5619+
; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo
5620+
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
5621+
; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s0
56205622
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5621-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
5622-
; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0
56235623
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
5624+
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
56245625
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
56255626
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
56265627
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
@@ -5846,33 +5847,33 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
58465847
; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo
58475848
; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo
58485849
; GFX11-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo
5849-
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1]
5850-
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5851-
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
5852-
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5850+
; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[8:9], v[0:1]
5851+
; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[10:11]
58535852
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
5854-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5855-
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
5856-
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5857-
; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v4, v12
5858-
; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo
5859-
; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo
5860-
; GFX11-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo
5861-
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
5862-
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
5863-
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5]
5853+
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
5854+
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[16:17], v[2:3]
5855+
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
5856+
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
5857+
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[10:11]
5858+
; GFX11-NEXT: v_add_co_u32 v10, s1, v4, v12
5859+
; GFX11-NEXT: v_add_co_ci_u32_e64 v11, s1, v5, v13, s1
5860+
; GFX11-NEXT: v_add_co_ci_u32_e64 v12, s1, v6, v14, s1
5861+
; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo
5862+
; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s0
5863+
; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[10:11], v[4:5]
5864+
; GFX11-NEXT: v_add_co_ci_u32_e64 v13, s1, v7, v15, s1
5865+
; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[14:15]
58645866
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
5865-
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5866-
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7]
5867-
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
5868-
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15]
5869-
; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
5870-
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7]
5871-
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19
5872-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
5873-
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
5867+
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
5868+
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[12:13], v[6:7]
5869+
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[12:13], v[6:7]
5870+
; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1
5871+
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v13
5872+
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
5873+
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[14:15]
58745874
; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6
5875-
; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
5875+
; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, vcc_lo
5876+
; GFX11-NEXT: v_cndmask_b16 v2, v4, 0, s0
58765877
; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
58775878
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v17
58785879
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
@@ -5882,10 +5883,10 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
58825883
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
58835884
; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo
58845885
; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4
5885-
; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v6, s0
5886-
; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v6, s0
5887-
; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v6, s0
5888-
; GFX11-NEXT: v_cndmask_b32_e64 v7, v19, v7, s0
5886+
; GFX11-NEXT: v_cndmask_b32_e64 v4, v10, v6, s0
5887+
; GFX11-NEXT: v_cndmask_b32_e64 v5, v11, v6, s0
5888+
; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s0
5889+
; GFX11-NEXT: v_cndmask_b32_e64 v7, v13, v7, s0
58895890
; GFX11-NEXT: s_setpc_b64 s[30:31]
58905891
%result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
58915892
ret <2 x i128> %result
@@ -6243,16 +6244,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
62436244
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
62446245
; GFX11-NEXT: s_and_b32 s0, 1, s18
62456246
; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
6246-
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
6247+
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
62476248
; GFX11-NEXT: s_cselect_b32 s1, 1, 0
6248-
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
6249+
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
62496250
; GFX11-NEXT: s_and_b32 s1, 1, s1
62506251
; GFX11-NEXT: s_ashr_i32 s10, s17, 31
6251-
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
6252+
; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1
62526253
; GFX11-NEXT: s_add_i32 s11, s10, 0x80000000
6253-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
6254-
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
6254+
; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0
62556255
; GFX11-NEXT: s_add_u32 s0, s4, s12
6256+
; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s1
62566257
; GFX11-NEXT: s_addc_u32 s1, s5, s13
62576258
; GFX11-NEXT: s_addc_u32 s2, s6, s14
62586259
; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
@@ -6268,17 +6269,18 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
62686269
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
62696270
; GFX11-NEXT: s_and_b32 s4, 1, s12
62706271
; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0
6271-
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
6272+
; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s4
62726273
; GFX11-NEXT: s_cselect_b32 s5, 1, 0
6273-
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
6274+
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
62746275
; GFX11-NEXT: s_and_b32 s5, 1, s5
6275-
; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
6276-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
6277-
; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, 0, s4
6276+
; GFX11-NEXT: v_cmp_ne_u32_e64 s5, 0, s5
6277+
; GFX11-NEXT: v_cndmask_b16 v1, v2, v1, s4
6278+
; GFX11-NEXT: s_ashr_i32 s4, s3, 31
6279+
; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
6280+
; GFX11-NEXT: v_cndmask_b16 v2, v3, 0, s5
62786281
; GFX11-NEXT: v_mov_b32_e32 v3, s8
62796282
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
62806283
; GFX11-NEXT: v_mov_b32_e32 v0, s16
6281-
; GFX11-NEXT: s_ashr_i32 s4, s3, 31
62826284
; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
62836285
; GFX11-NEXT: v_mov_b32_e32 v4, s9
62846286
; GFX11-NEXT: v_mov_b32_e32 v2, s17
@@ -6287,7 +6289,6 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
62876289
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
62886290
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
62896291
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
6290-
; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
62916292
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
62926293
; GFX11-NEXT: v_mov_b32_e32 v1, s2
62936294
; GFX11-NEXT: v_readfirstlane_b32 s1, v4

0 commit comments

Comments
 (0)