Skip to content

Commit 83cdd94

Browse files
committed
True16 for v_cndmask_b16 in MC
1 parent 81dcbef commit 83cdd94

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+4397
-3486
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3007,8 +3007,8 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
30073007
switch (I.getOpcode()) {
30083008
case AMDGPU::V_ADDC_U32_e32:
30093009
case AMDGPU::V_ADDC_U32_dpp:
3010-
case AMDGPU::V_CNDMASK_B16_e32:
3011-
case AMDGPU::V_CNDMASK_B16_dpp:
3010+
case AMDGPU::V_CNDMASK_B16_fake16_e32:
3011+
case AMDGPU::V_CNDMASK_B16_fake16_dpp:
30123012
case AMDGPU::V_CNDMASK_B32_e32:
30133013
case AMDGPU::V_CNDMASK_B32_dpp:
30143014
case AMDGPU::V_DIV_FMAS_F32_e64:
@@ -3023,8 +3023,8 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
30233023
HazardReg == AMDGPU::VCC_HI;
30243024
case AMDGPU::V_ADDC_U32_e64:
30253025
case AMDGPU::V_ADDC_U32_e64_dpp:
3026-
case AMDGPU::V_CNDMASK_B16_e64:
3027-
case AMDGPU::V_CNDMASK_B16_e64_dpp:
3026+
case AMDGPU::V_CNDMASK_B16_fake16_e64:
3027+
case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
30283028
case AMDGPU::V_CNDMASK_B32_e64:
30293029
case AMDGPU::V_CNDMASK_B32_e64_dpp:
30303030
case AMDGPU::V_SUBB_U32_e64:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 20 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1245,11 +1245,29 @@ class VOPSelectPat <ValueType vt> : GCNPat <
12451245
(vt (select i1:$src0, vt:$src1, vt:$src2)),
12461246
(V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
12471247
>;
1248+
class VOPSelectPat_t16 <ValueType vt> : GCNPat <
1249+
(vt (select i1:$src0, vt:$src1, vt:$src2)),
1250+
(V_CNDMASK_B16_t16_e64 0, VSrcT_b16:$src2, 0, VSrcT_b16:$src1, SSrc_i1:$src0)
1251+
>;
1252+
class VOPSelectPat_fake16 <ValueType vt> : GCNPat <
1253+
(vt (select i1:$src0, vt:$src1, vt:$src2)),
1254+
(V_CNDMASK_B16_fake16_e64 0, VSrc_b16:$src2, 0, VSrc_b16:$src1, SSrc_i1:$src0)
1255+
>;
12481256

12491257
def : VOPSelectModsPat <i32>;
12501258
def : VOPSelectModsPat <f32>;
1251-
def : VOPSelectPat <f16>;
1252-
def : VOPSelectPat <i16>;
1259+
let True16Predicate = NotHasTrue16BitInsts in {
1260+
def : VOPSelectPat <f16>;
1261+
def : VOPSelectPat <i16>;
1262+
} // End True16Predicate = NotHasTrue16BitInsts
1263+
let True16Predicate = UseRealTrue16Insts in {
1264+
def : VOPSelectPat_t16 <f16>;
1265+
def : VOPSelectPat_t16 <i16>;
1266+
} // End True16Predicate = UseRealTrue16Insts
1267+
let True16Predicate = UseFakeTrue16Insts in {
1268+
def : VOPSelectPat_fake16 <f16>;
1269+
def : VOPSelectPat_fake16 <i16>;
1270+
} // End True16Predicate = UseFakeTrue16Insts
12531271

12541272
let AddedComplexity = 1 in {
12551273
def : GCNPat <

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 23 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -714,6 +714,26 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
714714
def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>;
715715
def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>;
716716
// V_CNDMASK_B16 is VOP3 only
717+
def VOP2e_I16_I16_I16_I1_true16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
718+
let IsTrue16 = 1;
719+
let IsRealTrue16 = 1;
720+
let HasOpSel = 1;
721+
let DstRC64 = getVALUDstForVT<DstVT, 1, 1>.ret;
722+
let Src0RC64 = getVOP3SrcForVT<Src0VT, 1/*IsTrue16*/>.ret;
723+
let Src1RC64 = getVOP3SrcForVT<Src1VT, 1/*IsTrue16*/>.ret;
724+
let Src2RC64 = getVOP3SrcForVT<Src2VT, 1/*IsTrue16*/>.ret;
725+
let Src0Mod = getSrc0Mod<f16, DstVT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
726+
let Src1Mod = getSrcMod<f16, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
727+
let HasSrc2Mods = 0;
728+
let InsVOP3OpSel = getInsVOP3Base<Src0RC64, Src1RC64,
729+
Src2RC64, NumSrcArgs,
730+
HasClamp, 1/*HasModifiers*/, 0/*HasSrc2Mods*/, HasOMod,
731+
Src0Mod, Src1Mod, Src2Mod, 1/*HasOpSel*/>.ret;
732+
let Src0VOP3DPP = VGPRSrc_16;
733+
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
734+
let Src0ModVOP3DPP = getSrc0ModVOP3DPP<f16, DstVT, 0/*IsFake16*/>.ret;
735+
let Src1ModVOP3DPP = getSrcModVOP3DPP<f16, 0/*IsFake16*/>.ret;
736+
}
717737
def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
718738
let IsTrue16 = 1;
719739
let DstRC64 = getVALUDstForVT<DstVT>.ret;
@@ -765,8 +785,8 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
765785
// VOP2 Instructions
766786
//===----------------------------------------------------------------------===//
767787

768-
let SubtargetPredicate = isGFX11Plus in
769-
defm V_CNDMASK_B16 : VOP2eInst <"v_cndmask_b16", VOP2e_I16_I16_I16_I1_fake16>;
788+
defm V_CNDMASK_B16_t16 : VOP2eInst <"v_cndmask_b16_t16", VOP2e_I16_I16_I16_I1_true16>;
789+
defm V_CNDMASK_B16_fake16 : VOP2eInst <"v_cndmask_b16_fake16", VOP2e_I16_I16_I16_I1_fake16>;
770790
defm V_CNDMASK_B32 : VOP2eInst_VOPD <"v_cndmask_b32", VOP2e_I32_I32_I32_I1, 0x9, "v_cndmask_b32">;
771791
let SubtargetPredicate = HasMadMacF32Insts, isReMaterializable = 1 in
772792
def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
@@ -1835,7 +1855,7 @@ defm V_FMAMK_F16 : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<0x037
18351855
defm V_FMAAK_F16 : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<0x038, "v_fmaak_f16">;
18361856

18371857
// VOP3 only.
1838-
defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11_gfx12<0x25d>;
1858+
defm V_CNDMASK_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x25d, "v_cndmask_b16">;
18391859
defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11_gfx12<0x31c>;
18401860
defm V_BFM_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31d>;
18411861
defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31e>;

llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll

Lines changed: 68 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -5294,15 +5294,15 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
52945294
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
52955295
; GFX11-NEXT: s_and_b32 s0, 1, s10
52965296
; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0
5297-
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
5297+
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
52985298
; GFX11-NEXT: s_cselect_b32 s1, 1, 0
5299-
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
5299+
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
53005300
; GFX11-NEXT: s_and_b32 s1, 1, s1
5301-
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
5302-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5303-
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
5304-
; GFX11-NEXT: v_mov_b32_e32 v2, s5
5301+
; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1
5302+
; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0
53055303
; GFX11-NEXT: s_ashr_i32 s0, s9, 31
5304+
; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s1
5305+
; GFX11-NEXT: v_mov_b32_e32 v2, s5
53065306
; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000
53075307
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
53085308
; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
@@ -5447,20 +5447,20 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
54475447
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
54485448
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo
54495449
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo
5450-
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
5451-
; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
5452-
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5]
5453-
; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
5454-
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
5455-
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5450+
; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], v[0:1]
5451+
; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[2:3]
54565452
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
5457-
; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo
5458-
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
5453+
; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
5454+
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], v[4:5]
5455+
; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
5456+
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[2:3]
5457+
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
5458+
; GFX11-NEXT: v_cndmask_b16 v3, v7, v6, vcc_lo
5459+
; GFX11-NEXT: v_cndmask_b16 v2, v2, 0, s0
5460+
; GFX11-NEXT: v_xor_b32_e32 v2, v2, v3
54595461
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5
5460-
; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo
5461-
; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6
5462-
; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3
54635462
; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
5463+
; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3
54645464
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
54655465
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
54665466
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -5606,21 +5606,22 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
56065606
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
56075607
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
56085608
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5609-
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
5609+
; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[4:5], v[0:1]
56105610
; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0
5611+
; GFX11-NEXT: s_cselect_b32 s1, 1, 0
5612+
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5613+
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
5614+
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[6:7], v[2:3]
5615+
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
5616+
; GFX11-NEXT: s_and_b32 s0, 1, s1
56115617
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
5612-
; GFX11-NEXT: s_cselect_b32 s0, 1, 0
5613-
; GFX11-NEXT: s_and_b32 s0, 1, s0
5614-
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5615-
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
5616-
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1
56175618
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
5618-
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5619-
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5619+
; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo
5620+
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
5621+
; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s0
56205622
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5621-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
5622-
; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0
56235623
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
5624+
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
56245625
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
56255626
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
56265627
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
@@ -5846,33 +5847,33 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
58465847
; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo
58475848
; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo
58485849
; GFX11-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo
5849-
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1]
5850-
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5851-
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
5852-
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5850+
; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[8:9], v[0:1]
5851+
; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[10:11]
58535852
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
5854-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5855-
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
5856-
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5857-
; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v4, v12
5858-
; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo
5859-
; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo
5860-
; GFX11-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo
5861-
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
5862-
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
5863-
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5]
5853+
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
5854+
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[16:17], v[2:3]
5855+
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
5856+
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
5857+
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[10:11]
5858+
; GFX11-NEXT: v_add_co_u32 v10, s1, v4, v12
5859+
; GFX11-NEXT: v_add_co_ci_u32_e64 v11, s1, v5, v13, s1
5860+
; GFX11-NEXT: v_add_co_ci_u32_e64 v12, s1, v6, v14, s1
5861+
; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo
5862+
; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s0
5863+
; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[10:11], v[4:5]
5864+
; GFX11-NEXT: v_add_co_ci_u32_e64 v13, s1, v7, v15, s1
5865+
; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[14:15]
58645866
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
5865-
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5866-
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7]
5867-
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
5868-
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15]
5869-
; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
5870-
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7]
5871-
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19
5872-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
5873-
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
5867+
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
5868+
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[12:13], v[6:7]
5869+
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[12:13], v[6:7]
5870+
; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1
5871+
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v13
5872+
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
5873+
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[14:15]
58745874
; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6
5875-
; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
5875+
; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, vcc_lo
5876+
; GFX11-NEXT: v_cndmask_b16 v2, v4, 0, s0
58765877
; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
58775878
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v17
58785879
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
@@ -5882,10 +5883,10 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
58825883
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
58835884
; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo
58845885
; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4
5885-
; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v6, s0
5886-
; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v6, s0
5887-
; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v6, s0
5888-
; GFX11-NEXT: v_cndmask_b32_e64 v7, v19, v7, s0
5886+
; GFX11-NEXT: v_cndmask_b32_e64 v4, v10, v6, s0
5887+
; GFX11-NEXT: v_cndmask_b32_e64 v5, v11, v6, s0
5888+
; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s0
5889+
; GFX11-NEXT: v_cndmask_b32_e64 v7, v13, v7, s0
58895890
; GFX11-NEXT: s_setpc_b64 s[30:31]
58905891
%result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
58915892
ret <2 x i128> %result
@@ -6243,16 +6244,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
62436244
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
62446245
; GFX11-NEXT: s_and_b32 s0, 1, s18
62456246
; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
6246-
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
6247+
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
62476248
; GFX11-NEXT: s_cselect_b32 s1, 1, 0
6248-
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
6249+
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
62496250
; GFX11-NEXT: s_and_b32 s1, 1, s1
62506251
; GFX11-NEXT: s_ashr_i32 s10, s17, 31
6251-
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
6252+
; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1
62526253
; GFX11-NEXT: s_add_i32 s11, s10, 0x80000000
6253-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
6254-
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
6254+
; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0
62556255
; GFX11-NEXT: s_add_u32 s0, s4, s12
6256+
; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s1
62566257
; GFX11-NEXT: s_addc_u32 s1, s5, s13
62576258
; GFX11-NEXT: s_addc_u32 s2, s6, s14
62586259
; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
@@ -6268,17 +6269,18 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
62686269
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
62696270
; GFX11-NEXT: s_and_b32 s4, 1, s12
62706271
; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0
6271-
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
6272+
; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s4
62726273
; GFX11-NEXT: s_cselect_b32 s5, 1, 0
6273-
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
6274+
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
62746275
; GFX11-NEXT: s_and_b32 s5, 1, s5
6275-
; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
6276-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
6277-
; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, 0, s4
6276+
; GFX11-NEXT: v_cmp_ne_u32_e64 s5, 0, s5
6277+
; GFX11-NEXT: v_cndmask_b16 v1, v2, v1, s4
6278+
; GFX11-NEXT: s_ashr_i32 s4, s3, 31
6279+
; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
6280+
; GFX11-NEXT: v_cndmask_b16 v2, v3, 0, s5
62786281
; GFX11-NEXT: v_mov_b32_e32 v3, s8
62796282
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
62806283
; GFX11-NEXT: v_mov_b32_e32 v0, s16
6281-
; GFX11-NEXT: s_ashr_i32 s4, s3, 31
62826284
; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
62836285
; GFX11-NEXT: v_mov_b32_e32 v4, s9
62846286
; GFX11-NEXT: v_mov_b32_e32 v2, s17
@@ -6287,7 +6289,6 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
62876289
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
62886290
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
62896291
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
6290-
; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
62916292
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
62926293
; GFX11-NEXT: v_mov_b32_e32 v1, s2
62936294
; GFX11-NEXT: v_readfirstlane_b32 s1, v4

0 commit comments

Comments
 (0)