From 8cc493cf88858eeac887370b3e87b9187d75211d Mon Sep 17 00:00:00 2001 From: guochen2 Date: Thu, 12 Dec 2024 12:05:32 -0500 Subject: [PATCH 1/2] True16 for v_cndmask_b16 in MC --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 8 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 22 +- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 28 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 135 +- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 187 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 1497 +++++++++-------- .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll | 126 +- .../CodeGen/AMDGPU/extract-subvector-16bit.ll | 156 +- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 92 +- llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll | 105 +- llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll | 105 +- .../AMDGPU/fmul-2-combine-multi-use.ll | 27 +- llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll | 144 +- llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 31 +- .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 101 +- llvm/test/CodeGen/AMDGPU/fract-match.ll | 56 +- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 143 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 421 +++-- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 421 +++-- llvm/test/CodeGen/AMDGPU/llvm.round.ll | 22 +- llvm/test/CodeGen/AMDGPU/lround.ll | 8 +- llvm/test/CodeGen/AMDGPU/maximumnum.ll | 86 +- llvm/test/CodeGen/AMDGPU/minimumnum.ll | 86 +- .../AMDGPU/select-fabs-fneg-extract.f16.ll | 190 +-- .../AMDGPU/select-fabs-fneg-extract.v2f16.ll | 673 ++++---- .../AMDGPU/select-flags-to-fmin-fmax.ll | 252 +-- llvm/test/CodeGen/AMDGPU/select.f16.ll | 740 ++++---- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 425 ++++- .../CodeGen/AMDGPU/valu-mask-write-hazard.mir | 6 +- llvm/test/MC/AMDGPU/gfx11_asm_vop3.s | 173 +- llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s | 187 +- llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s | 85 +- llvm/test/MC/AMDGPU/gfx12_asm_vop3.s | 173 +- llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s | 203 ++- llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s | 95 +- .../Disassembler/AMDGPU/gfx11_dasm_vop3.txt | 95 +- .../AMDGPU/gfx11_dasm_vop3_dpp16.txt | 111 +- .../AMDGPU/gfx11_dasm_vop3_dpp8.txt | 57 +- .../Disassembler/AMDGPU/gfx12_dasm_vop3.txt | 95 +- .../AMDGPU/gfx12_dasm_vop3_dpp16.txt | 101 +- .../AMDGPU/gfx12_dasm_vop3_dpp8.txt | 47 +- 41 files changed, 4314 insertions(+), 3401 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 5207201e14c09..6baef137df5e1 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -3007,8 +3007,8 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { switch (I.getOpcode()) { case AMDGPU::V_ADDC_U32_e32: case AMDGPU::V_ADDC_U32_dpp: - case AMDGPU::V_CNDMASK_B16_e32: - case AMDGPU::V_CNDMASK_B16_dpp: + case AMDGPU::V_CNDMASK_B16_fake16_e32: + case AMDGPU::V_CNDMASK_B16_fake16_dpp: case AMDGPU::V_CNDMASK_B32_e32: case AMDGPU::V_CNDMASK_B32_dpp: case AMDGPU::V_DIV_FMAS_F32_e64: @@ -3023,8 +3023,8 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { HazardReg == AMDGPU::VCC_HI; case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_ADDC_U32_e64_dpp: - case AMDGPU::V_CNDMASK_B16_e64: - case AMDGPU::V_CNDMASK_B16_e64_dpp: + case AMDGPU::V_CNDMASK_B16_fake16_e64: + case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp: case AMDGPU::V_CNDMASK_B32_e64: case AMDGPU::V_CNDMASK_B32_e64_dpp: case AMDGPU::V_SUBB_U32_e64: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index cdc1132579d8d..c67c73649fc3d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1245,11 +1245,29 @@ class VOPSelectPat : GCNPat < (vt (select i1:$src0, vt:$src1, vt:$src2)), (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0) >; +class VOPSelectPat_t16 : GCNPat < + (vt (select i1:$src0, vt:$src1, vt:$src2)), + (V_CNDMASK_B16_t16_e64 0, VSrcT_b16:$src2, 0, VSrcT_b16:$src1, SSrc_i1:$src0) +>; +class VOPSelectPat_fake16 : GCNPat < + (vt (select i1:$src0, vt:$src1, vt:$src2)), + (V_CNDMASK_B16_fake16_e64 0, VSrc_b16:$src2, 0, VSrc_b16:$src1, SSrc_i1:$src0) +>; def : VOPSelectModsPat ; def : VOPSelectModsPat ; -def : VOPSelectPat ; -def : VOPSelectPat ; +let True16Predicate = NotHasTrue16BitInsts in { + def : VOPSelectPat ; + def : VOPSelectPat ; +} // End True16Predicate = NotHasTrue16BitInsts +let True16Predicate = UseRealTrue16Insts in { + def : VOPSelectPat_t16 ; + def : VOPSelectPat_t16 ; +} // End True16Predicate = UseRealTrue16Insts +let True16Predicate = UseFakeTrue16Insts in { + def : VOPSelectPat_fake16 ; + def : VOPSelectPat_fake16 ; +} // End True16Predicate = UseFakeTrue16Insts let AddedComplexity = 1 in { def : GCNPat < diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index ca4a0fa706c30..691e1cea917bb 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -714,6 +714,26 @@ class VOP2e_SGPR ArgVT> : VOPProfile { def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>; def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>; // V_CNDMASK_B16 is VOP3 only +def VOP2e_I16_I16_I16_I1_true16 : VOP2e_SGPR<[i16, i16, i16, i1]> { + let IsTrue16 = 1; + let IsRealTrue16 = 1; + let HasOpSel = 1; + let DstRC64 = getVALUDstForVT.ret; + let Src0RC64 = getVOP3SrcForVT.ret; + let Src1RC64 = getVOP3SrcForVT.ret; + let Src2RC64 = getVOP3SrcForVT.ret; + let Src0Mod = getSrc0Mod.ret; + let Src1Mod = getSrcMod.ret; + let HasSrc2Mods = 0; + let InsVOP3OpSel = getInsVOP3Base.ret; + let Src0VOP3DPP = VGPRSrc_16; + let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; + let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; +} def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> { let IsTrue16 = 1; let DstRC64 = getVALUDstForVT.ret; @@ -765,8 +785,10 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { // VOP2 Instructions //===----------------------------------------------------------------------===// -let SubtargetPredicate = isGFX11Plus in -defm V_CNDMASK_B16 : VOP2eInst <"v_cndmask_b16", VOP2e_I16_I16_I16_I1_fake16>; +let SubtargetPredicate = isGFX11Plus, True16Predicate = UseRealTrue16Insts in +defm V_CNDMASK_B16_t16 : VOP2eInst <"v_cndmask_b16_t16", VOP2e_I16_I16_I16_I1_true16>; +let SubtargetPredicate = isGFX11Plus, True16Predicate = UseFakeTrue16Insts in +defm V_CNDMASK_B16_fake16 : VOP2eInst <"v_cndmask_b16_fake16", VOP2e_I16_I16_I16_I1_fake16>; defm V_CNDMASK_B32 : VOP2eInst_VOPD <"v_cndmask_b32", VOP2e_I32_I32_I32_I1, 0x9, "v_cndmask_b32">; let SubtargetPredicate = HasMadMacF32Insts, isReMaterializable = 1 in def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>; @@ -1830,7 +1852,7 @@ defm V_FMAMK_F16 : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<0x037 defm V_FMAAK_F16 : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<0x038, "v_fmaak_f16">; // VOP3 only. -defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11_gfx12<0x25d>; +defm V_CNDMASK_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x25d, "v_cndmask_b16">; defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11_gfx12<0x31c>; defm V_BFM_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31d>; defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31e>; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index e289ee759da15..e27d4372d87be 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -5294,15 +5294,15 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s10 ; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 ; GFX11-NEXT: s_and_b32 s1, 1, s1 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 -; GFX11-NEXT: v_mov_b32_e32 v2, s5 +; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0 ; GFX11-NEXT: s_ashr_i32 s0, s9, 31 +; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 ; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0 @@ -5447,20 +5447,20 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo -; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5] -; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], v[0:1] +; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[2:3] ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5] -; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX11-NEXT: v_cndmask_b16 v3, v7, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v2, v2, 0, s0 +; GFX11-NEXT: v_xor_b32_e32 v2, v2, v3 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo -; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -5606,21 +5606,22 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[4:5], v[0:1] ; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11-NEXT: s_cselect_b32 s1, 1, 0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[6:7], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-NEXT: s_and_b32 s0, 1, s1 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 -; GFX11-NEXT: s_cselect_b32 s0, 1, 0 -; GFX11-NEXT: s_and_b32 s0, 1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s0 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo @@ -5846,33 +5847,33 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[8:9], v[0:1] +; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[10:11] ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v4, v12 -; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[16:17], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[10:11] +; GFX11-NEXT: v_add_co_u32 v10, s1, v4, v12 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, s1, v5, v13, s1 +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, s1, v6, v14, s1 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s0 +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[10:11], v[4:5] +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, s1, v7, v15, s1 +; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[14:15] ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15] -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7] -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[12:13], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[12:13], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1 +; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v13 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[14:15] ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v2, v4, 0, s0 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5882,10 +5883,10 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v19, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v10, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v11, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v13, v7, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result @@ -6243,16 +6244,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s18 ; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 ; GFX11-NEXT: s_and_b32 s1, 1, s1 ; GFX11-NEXT: s_ashr_i32 s10, s17, 31 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 ; GFX11-NEXT: s_add_i32 s11, s10, 0x80000000 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0 ; GFX11-NEXT: s_add_u32 s0, s4, s12 +; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s1 ; GFX11-NEXT: s_addc_u32 s1, s5, s13 ; GFX11-NEXT: s_addc_u32 s2, s6, s14 ; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] @@ -6268,17 +6269,18 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX11-NEXT: s_and_b32 s4, 1, s12 ; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 +; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 ; GFX11-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX11-NEXT: s_and_b32 s5, 1, s5 -; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, 0, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 s5, 0, s5 +; GFX11-NEXT: v_cndmask_b16 v1, v2, v1, s4 +; GFX11-NEXT: s_ashr_i32 s4, s3, 31 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 +; GFX11-NEXT: v_cndmask_b16 v2, v3, 0, s5 ; GFX11-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_ashr_i32 s4, s3, 31 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v4, s9 ; GFX11-NEXT: v_mov_b32_e32 v2, s17 @@ -6287,7 +6289,6 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo -; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 43ebe156eb2a2..af96da1bb25ad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -5297,28 +5297,28 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX11-NEXT: s_sub_u32 s8, s0, s4 ; GFX11-NEXT: s_subb_u32 s9, s1, s5 ; GFX11-NEXT: s_subb_u32 s10, s2, s6 -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] ; GFX11-NEXT: s_subb_u32 s11, s3, s7 +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] ; GFX11-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[10:11], s[2:3] +; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0 ; GFX11-NEXT: s_cselect_b32 s12, 1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3] -; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s12 ; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_ashr_i32 s0, s11, 31 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX11-NEXT: s_and_b32 s1, 1, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0 +; GFX11-NEXT: s_ashr_i32 s0, s11, 31 +; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v3, s11 ; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9 -; GFX11-NEXT: v_mov_b32_e32 v3, s11 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_and_b32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5470,25 +5470,26 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo ; GFX11-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], v[4:5] +; GFX11-NEXT: v_cmp_lt_u64_e64 s1, 0, v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, 0, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX11-NEXT: v_cndmask_b16 v2, v9, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0 +; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 -; GFX11-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5638,29 +5639,29 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, v[4:5], v[0:1] ; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 ; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX11-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX11-NEXT: v_cmp_lt_i64_e64 s4, v[6:7], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX11-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-NEXT: s_and_b32 s0, 1, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5896,37 +5897,38 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_sub_co_u32 v16, vcc_lo, v0, v8 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo ; GFX11-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e64 s1, 0, v[8:9] +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[16:17], v[0:1] ; GFX11-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12 -; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[18:19], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, 0, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1 +; GFX11-NEXT: v_sub_co_u32 v8, s1, v4, v12 +; GFX11-NEXT: v_sub_co_ci_u32_e64 v9, s1, v5, v13, s1 +; GFX11-NEXT: v_sub_co_ci_u32_e64 v10, s1, v6, v14, s1 +; GFX11-NEXT: v_sub_co_ci_u32_e64 v11, s1, v7, v15, s1 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[8:9], v[4:5] +; GFX11-NEXT: v_cmp_lt_u64_e64 s1, 0, v[12:13] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[10:11], v[6:7] ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13] -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15] -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7] -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v21 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[10:11], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1 +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, 0, v[14:15] +; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v11 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 +; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v2, v5, v4, s0 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v19 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5938,8 +5940,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v21, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v10, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v11, v7, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result @@ -6303,57 +6305,57 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: s_sub_u32 s18, s0, s8 ; GFX11-NEXT: s_subb_u32 s19, s1, s9 ; GFX11-NEXT: s_subb_u32 s16, s2, s10 -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1] ; GFX11-NEXT: s_subb_u32 s17, s3, s11 +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1] ; GFX11-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[16:17], s[2:3] +; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 ; GFX11-NEXT: s_cselect_b32 s20, 1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] -; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s20 ; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_ashr_i32 s8, s17, 31 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX11-NEXT: s_and_b32 s1, 1, s1 -; GFX11-NEXT: s_add_i32 s9, s8, 0x80000000 +; GFX11-NEXT: s_ashr_i32 s8, s17, 31 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 +; GFX11-NEXT: s_add_i32 s9, s8, 0x80000000 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0 ; GFX11-NEXT: s_sub_u32 s0, s4, s12 +; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s1 ; GFX11-NEXT: s_subb_u32 s1, s5, s13 ; GFX11-NEXT: s_subb_u32 s2, s6, s14 -; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX11-NEXT: s_subb_u32 s3, s7, s15 -; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3 +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] ; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 -; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7] +; GFX11-NEXT: v_cmp_lt_i64_e64 s5, s[2:3], s[6:7] ; GFX11-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 ; GFX11-NEXT: s_cselect_b32 s10, 1, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 ; GFX11-NEXT: s_and_b32 s4, 1, s10 ; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 ; GFX11-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 ; GFX11-NEXT: s_and_b32 s5, 1, s5 -; GFX11-NEXT: s_ashr_i32 s4, s3, 31 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s18 +; GFX11-NEXT: v_cmp_ne_u32_e64 s5, 0, s5 +; GFX11-NEXT: v_cndmask_b16 v1, v2, v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-NEXT: v_mov_b32_e32 v7, s3 +; GFX11-NEXT: v_cndmask_b16 v2, v4, v3, s5 +; GFX11-NEXT: v_mov_b32_e32 v3, s18 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_ashr_i32 s4, s3, 31 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v4, s19 ; GFX11-NEXT: v_mov_b32_e32 v2, s17 @@ -6362,6 +6364,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index bc359d6ff3aaa..12e677e5546fd 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -34508,14 +34508,25 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_select_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_select_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_select_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, bfloat %a, bfloat %b ret bfloat %op } @@ -34573,21 +34584,24 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v1.l -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16: ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1 +; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg bfloat %a %op = select i1 %cond, bfloat %neg.a, bfloat %b @@ -34647,21 +34661,24 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_xor_b16 v2.l, 0x8000, v2.l -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16: ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v2 +; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.b = fneg bfloat %b %op = select i1 %cond, bfloat %a, bfloat %neg.b @@ -34749,11 +34766,15 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v3, v4 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v3.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -34761,14 +34782,15 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX11FAKE16-LABEL: v_select_v2bf16: ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11FAKE16-NEXT: v_cndmask_b16 v1, v2, v1, vcc_lo +; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v0, v3, vcc_lo ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b ret <2 x bfloat> %op @@ -34856,14 +34878,19 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX11TRUE16-LABEL: v_vselect_v2bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v4.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v2.l, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -34872,14 +34899,15 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 +; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v3, v2, vcc_lo +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cndmask_b16 v1, v5, v4, s0 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b @@ -34936,16 +34964,27 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_select_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_select_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_select_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cndmask_b16 v0, s1, v0, vcc_lo +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, bfloat %a, bfloat %b %cast = bitcast bfloat %op to i16 @@ -35038,17 +35077,21 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX11TRUE16-LABEL: s_select_v2bf16: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_lshr_b32 s2, s1, 16 -; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s0 +; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_cndmask_b32 v1, v3, v4 -; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; @@ -35056,13 +35099,13 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_lshr_b32 s2, s0, 16 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11FAKE16-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0 -; GFX11FAKE16-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11FAKE16-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v0, s0 +; GFX11FAKE16-NEXT: s_lshr_b32 s0, s1, 16 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo +; GFX11FAKE16-NEXT: v_cndmask_b16 v1, s0, v1, vcc_lo ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo -; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b16 v0, s1, v0, vcc_lo +; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11FAKE16-NEXT: ; return to shader part epilog @@ -35156,17 +35199,20 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX11TRUE16-LABEL: s_vselect_v2bf16: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_lshr_b32 s2, s1, 16 -; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s3 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo +; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, s2 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 @@ -35174,16 +35220,16 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX11FAKE16-LABEL: s_vselect_v2bf16: ; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_lshr_b32 s2, s0, 16 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0 +; GFX11FAKE16-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 +; GFX11FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s0 ; GFX11FAKE16-NEXT: s_lshr_b32 s0, s1, 16 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo -; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b16 v0, s0, v0, s2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cndmask_b16 v1, s1, v1, vcc_lo +; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11FAKE16-NEXT: ; return to shader part epilog @@ -36876,61 +36922,63 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX11TRUE16-LABEL: s_vselect_v4bf16: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_lshr_b32 s4, s3, 16 -; GFX11TRUE16-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s4 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s5 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, s2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, s0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, s4 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s3 -; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, s1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc_lo -; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 +; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 +; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, s6 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v2.l, s4 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v2.h, v3.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.h, s5 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11TRUE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; ; GFX11FAKE16-LABEL: s_vselect_v4bf16: ; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_lshr_b32 s4, s1, 16 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX11FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1 -; GFX11FAKE16-NEXT: s_lshr_b32 s4, s3, 16 -; GFX11FAKE16-NEXT: s_lshr_b32 s5, s0, 16 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo -; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, s5 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11FAKE16-NEXT: v_mov_b32_e32 v6, s0 -; GFX11FAKE16-NEXT: s_lshr_b32 s0, s2, 16 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo +; GFX11FAKE16-NEXT: s_lshr_b32 s7, s1, 16 +; GFX11FAKE16-NEXT: s_lshr_b32 s9, s0, 16 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3 +; GFX11FAKE16-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s9 +; GFX11FAKE16-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX11FAKE16-NEXT: s_lshr_b32 s8, s3, 16 +; GFX11FAKE16-NEXT: s_lshr_b32 s0, s2, 16 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_cndmask_b16 v0, s8, v0, s6 +; GFX11FAKE16-NEXT: v_cndmask_b16 v1, s0, v1, s4 +; GFX11FAKE16-NEXT: v_cndmask_b16 v2, s2, v2, vcc_lo +; GFX11FAKE16-NEXT: v_cndmask_b16 v3, s3, v3, s5 +; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v0 ; GFX11FAKE16-NEXT: ; return to shader part epilog %cond = icmp eq <4 x i32> %c, zeroinitializer %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b @@ -37078,53 +37126,60 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX11TRUE16-LABEL: v_vselect_v4bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v8.l, v3.l, s1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v5.l, s2 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_vselect_v4bf16: ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v1, 1, v1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GFX11FAKE16-NEXT: v_cndmask_b16 v5, v7, v5, s2 +; GFX11FAKE16-NEXT: v_cndmask_b16 v4, v6, v4, s0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_cndmask_b16 v1, v2, v1, vcc_lo +; GFX11FAKE16-NEXT: v_cndmask_b16 v2, v3, v0, s1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v4, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v1, v2, v5, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b ret <4 x bfloat> %op @@ -37368,93 +37423,95 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX11TRUE16-LABEL: v_vselect_v8bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v11.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo -; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v13.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l -; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v16, v17, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v14, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v12 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v14, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo -; GFX11TRUE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v7 +; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v6 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v5 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v1 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v2 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v3 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, s2 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v15.l, v11.l, s3 +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v14.l, v10.l, s4 +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v3.l, v2.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v12.l, v8.l, s0 +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v5.l, v4.l, s1 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v13.l, v9.l, s5 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v7.l, v6.l, s6 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l +; GFX11TRUE16-NEXT: v_perm_b32 v0, v4, v5, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v1, v2, v6, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v2, v3, v7, 0x5040100 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_perm_b32 v3, v8, v9, 0x5040100 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_vselect_v8bf16: ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v6, v15, v11 :: v_dual_and_b32 v1, 1, v1 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v3, 1, v3 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v13, v9 :: v_dual_and_b32 v7, 1, v7 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo +; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v6 +; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v4 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo -; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v5 +; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v7 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v3 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v1 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v4 +; GFX11FAKE16-NEXT: v_cndmask_b16 v3, v15, v11, s2 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; GFX11FAKE16-NEXT: v_cndmask_b16 v2, v14, v10, s5 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; GFX11FAKE16-NEXT: v_cndmask_b16 v9, v13, v9, s3 +; GFX11FAKE16-NEXT: v_cndmask_b16 v8, v12, v8, s0 +; GFX11FAKE16-NEXT: v_cndmask_b16 v6, v7, v6, vcc_lo +; GFX11FAKE16-NEXT: v_cndmask_b16 v5, v10, v5, s1 +; GFX11FAKE16-NEXT: v_cndmask_b16 v4, v11, v4, s4 +; GFX11FAKE16-NEXT: v_cndmask_b16 v7, v1, v0, s6 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_perm_b32 v0, v6, v8, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v1, v5, v9, 0x5040100 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v3, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b ret <8 x bfloat> %op @@ -38024,181 +38081,176 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v49.l, v26.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v34.l, v22.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v53.l, v24.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v54.l, v16.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v36.l, v21.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v51.l, v25.l -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc_lo -; GFX11TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9 ; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v37.l, v28.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v38.l, v20.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v36, vcc_lo -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8 +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v37, v38, vcc_lo -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v39, v48, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v49, v50, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v32.l, v23.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v51, v52, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10 ; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v53, v54, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v30, v22, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v29, v21, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v28, v20, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v27, v19, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v24, v16, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v25, v17, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v26, v18, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6 +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v28.l, v20.l, s8 +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v38.l, v37.l, s7 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v12 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v10 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v27.l, v19.l, s6 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v48.l, v39.l, s5 +; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v54.l, v53.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v24.l, v16.l, s0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v30.l, v22.l, s10 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v34.l, v33.l, s11 +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v29.l, v21.l, s12 +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v36.l, v35.l, s9 +; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v52.l, v51.l, s1 +; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v25.l, v17.l, s2 +; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v50.l, v49.l, s3 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v26.l, v18.l, s4 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v0.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l +; GFX11TRUE16-NEXT: v_perm_b32 v0, v7, v8, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v1, v5, v9, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v5, v14, v15, 0x5040100 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v31 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.l -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v16.l -; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v14, v17, v32 :: v_dual_and_b32 v15, 1, v15 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v3, v23, vcc_lo -; GFX11TRUE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v31.l, v23.l, s14 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, v32.l, s13 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l +; GFX11TRUE16-NEXT: v_perm_b32 v2, v6, v4, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v4, v12, v13, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v6, v16, v17, 0x5040100 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h +; GFX11TRUE16-NEXT: v_perm_b32 v3, v10, v11, 0x5040100 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_vselect_v16bf16: ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27 -; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v12, v30, v22 :: v_dual_and_b32 v11, 1, v11 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 ; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24 -; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 ; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25 -; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12 ; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo +; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo -; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v12 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v10 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14 +; GFX11FAKE16-NEXT: v_cndmask_b16 v1, v25, v17, s2 +; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v24, v16, s0 +; GFX11FAKE16-NEXT: v_cndmask_b16 v10, v54, v53, vcc_lo +; GFX11FAKE16-NEXT: v_cndmask_b16 v11, v52, v51, s1 +; GFX11FAKE16-NEXT: v_cndmask_b16 v6, v30, v22, s10 +; GFX11FAKE16-NEXT: v_cndmask_b16 v7, v34, v33, s11 +; GFX11FAKE16-NEXT: v_cndmask_b16 v5, v29, v21, s12 +; GFX11FAKE16-NEXT: v_cndmask_b16 v8, v36, v35, s9 +; GFX11FAKE16-NEXT: v_cndmask_b16 v4, v28, v20, s8 +; GFX11FAKE16-NEXT: v_cndmask_b16 v9, v38, v37, s7 +; GFX11FAKE16-NEXT: v_cndmask_b16 v3, v27, v19, s6 +; GFX11FAKE16-NEXT: v_cndmask_b16 v2, v26, v18, s4 +; GFX11FAKE16-NEXT: v_cndmask_b16 v12, v50, v49, s3 +; GFX11FAKE16-NEXT: v_cndmask_b16 v13, v48, v39, s5 +; GFX11FAKE16-NEXT: v_perm_b32 v0, v10, v0, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v2, v12, v2, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v3, v13, v3, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v5, v8, v5, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo -; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v31 +; GFX11FAKE16-NEXT: v_cndmask_b16 v10, v31, v23, s14 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cndmask_b16 v11, v14, v32, s13 +; GFX11FAKE16-NEXT: v_perm_b32 v7, v11, v10, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b ret <16 x bfloat> %op @@ -39660,217 +39712,197 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:8 ; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:68 ; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:4 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27 +; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s22, 1, v22 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s24, 1, v24 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v30 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s27, 1, v26 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v28 +; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v10 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v12 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v17 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s16, 1, v16 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v19 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s18, 1, v18 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v21 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s20, 1, v20 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v23 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s23, 1, v25 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s25, 1, v27 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s28, 1, v29 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32) +; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v31 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v96.l, v32.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v97.l, v33.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v98.l, v34.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v32.l, v33.l, s26 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v33 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v99.l, v35.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v100.l, v36.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v34.l, v35.l, s29 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v35 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v101.l, v37.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v102.l, v38.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v36.l, v37.l, s27 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v37 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v36 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v103.l, v39.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v112.l, v48.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v38.l, v39.l, s24 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v39 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v38 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v113.l, v49.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v114.l, v50.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v115.l, v51.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v116.l, v52.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v48.l, v49.l, s22 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v48 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v117.l, v53.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v118.l, v54.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v119.l, v55.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v128.l, v64.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v53 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v52 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v129.l, v65.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v130.l, v66.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v131.l, v67.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v65 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v64 +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v50.l, v51.l, s20 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v132.l, v68.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v68 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v133.l, v69.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v69 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v134.l, v70.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v70 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v135.l, v71.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v71 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v70 -; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v71 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v80 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(6) +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v81 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v146.l, v82.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v82 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v147.l, v83.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v83 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82 -; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v83 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v30.l, v84.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v84 -; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v96, v96, v97, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 -; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v84 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v97.l, v85.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v85 -; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v98, v98, v99, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 -; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX11TRUE16-NEXT: v_mov_b16_e64 v144.l, v80.l -; GFX11TRUE16-NEXT: v_mov_b16_e64 v145.l, v81.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v81 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v100, v101, vcc_lo -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v80 -; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 -; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v68 -; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v102, v103, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 -; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v66 -; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v112, v113, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 -; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v64 -; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v114, v115, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18 -; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v116, v117, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 -; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v118, v119, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 -; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v128, v129, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v49 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v130, v131, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 -; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v132, v133, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v134, v135, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v144, v145, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v146, v147, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 1, v31 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v85 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v28.l, v86.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v86 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v99.l, v87.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v87 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v30, v97, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v86, 16, v86 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v30.l, v84.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v84.l, v85.l -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v28, v99, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v28.l, v86.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v85.l, v87.l -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v64, v65, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v66, v67, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v68, v69, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v30, v84, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v28, v85, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v82, v83, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 -; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v70, v71, vcc_lo -; GFX11TRUE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v10, v21, v20, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v11, v23, v22, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v8, v17, v16, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v9, v19, v18, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v12, v25, v24, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v13, v27, v26, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v14, v29, v98, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v15, v31, v96, 0x5040100 +; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v86.l, v87.l, s0 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v87 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v8 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v50 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v52.l, v53.l, s18 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v54.l, v55.l, s16 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v55 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v54 +; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v64.l, v65.l, s14 +; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v66.l, v67.l, s12 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v67 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v66 +; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v70.l, v71.l, s8 +; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v82.l, v83.l, s4 +; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v10.l, v9.l, s28 +; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v12.l, v11.l, s25 +; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v14.l, v13.l, s23 +; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v18.l, v15.l, s21 +; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v22.l, v21.l, s17 +; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v26.l, v25.l, s13 +; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v30.l, v29.l, s9 +; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v32.l, v31.l, s7 +; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v34.l, v33.l, s5 +; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v36.l, v35.l, s3 +; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v38.l, v37.l, s1 +; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v48.l, v39.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v17.l, v16.l, s0 +; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v68.l, v69.l, s10 +; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v80.l, v81.l, s6 +; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v84.l, v85.l, s2 +; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v20.l, v19.l, s19 +; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v24.l, v23.l, s15 +; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v28.l, v27.l, s11 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v18.l, v7.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v21.l, v4.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v23.l, v3.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v24.l, v3.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v25.l, v2.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v26.l, v2.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v27.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v28.l, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v29.l, v0.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v30.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v15.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v14.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v14.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v13.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v10.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v31.l, v9.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v32.l, v8.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v15.h +; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v18, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v2, v2, v19, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v3, v3, v6, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v4, v4, v20, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v6, v12, v21, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v7, v14, v22, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v8, v11, v23, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v9, v16, v24, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v10, v10, v25, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v11, v17, v26, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v12, v31, v27, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v13, v32, v28, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v14, v33, v29, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v15, v15, v30, 0x5040100 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_vselect_v32bf16: @@ -39910,167 +39942,168 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:8 ; GFX11FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:68 ; GFX11FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:4 -; GFX11FAKE16-NEXT: v_and_b32_e32 v30, 1, v30 -; GFX11FAKE16-NEXT: v_and_b32_e32 v28, 1, v28 -; GFX11FAKE16-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30 -; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14 ; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 +; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX11FAKE16-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX11FAKE16-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX11FAKE16-NEXT: v_and_b32_e32 v30, 1, v30 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 -; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(26) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 ; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(24) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 -; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 ; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v49 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 ; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v50, v51, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18 -; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v50 +; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 1, v11 ; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v52, v53, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 ; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v52 ; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 1, v27 +; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s16, 1, v16 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s18, 1, v18 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s20, 1, v20 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s22, 1, v22 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s24, 1, v24 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v30 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s27, 1, v26 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v28 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v10 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v12 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v17 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v19 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v21 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v23 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s23, 1, v25 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s25, 1, v27 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s28, 1, v29 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v31 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v32 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11FAKE16-NEXT: v_cndmask_b16 v15, v32, v33, s26 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11FAKE16-NEXT: v_cndmask_b16 v14, v34, v35, s29 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v34 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11FAKE16-NEXT: v_cndmask_b16 v13, v36, v37, s27 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v36 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11FAKE16-NEXT: v_cndmask_b16 v12, v38, v39, s24 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v38 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11FAKE16-NEXT: v_cndmask_b16 v11, v48, v49, s22 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v48 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11FAKE16-NEXT: v_cndmask_b16 v16, v50, v51, s20 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v51 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v50 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11FAKE16-NEXT: v_cndmask_b16 v19, v52, v53, s18 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v53 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v52 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v54, v55, vcc_lo -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 +; GFX11FAKE16-NEXT: v_cndmask_b16 v22, v54, v55, s16 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v55 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v54 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v14, v64, v65 :: v_dual_and_b32 v19, 1, v19 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v64 +; GFX11FAKE16-NEXT: v_cndmask_b16 v25, v64, v65, s14 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v65 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v64 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v66, v67, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 -; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v66 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v67 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v66 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v68, v69, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v68 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v69 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v68 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v70, v71, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v71 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v70 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v71 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v70 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v80, v81, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v81 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v80 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v81 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v80 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v82, v83, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11FAKE16-NEXT: v_and_b32_e32 v31, 1, v31 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v83 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v83 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v82 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v84, v85, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 1, v29 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v85 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v84 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v85 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v84 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v86, v87, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v87 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v86 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v64, v65, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v66, v67, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v68, v69, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v84, v85, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v86, v87, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v82, v83, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 -; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v70, v71, vcc_lo -; GFX11FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v10, v21, v20, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v11, v23, v22, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v8, v17, v16, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v9, v19, v18, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v12, v25, v24, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v13, v27, v26, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v14, v29, v28, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v15, v31, v30, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b16 v54, v86, v87, s0 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v87 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v86 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11FAKE16-NEXT: v_cndmask_b16 v28, v66, v67, s12 +; GFX11FAKE16-NEXT: v_cndmask_b16 v31, v68, v69, s10 +; GFX11FAKE16-NEXT: v_cndmask_b16 v34, v70, v71, s8 +; GFX11FAKE16-NEXT: v_cndmask_b16 v37, v80, v81, s6 +; GFX11FAKE16-NEXT: v_cndmask_b16 v48, v82, v83, s4 +; GFX11FAKE16-NEXT: v_cndmask_b16 v51, v84, v85, s2 +; GFX11FAKE16-NEXT: v_cndmask_b16 v65, v4, v3, s28 +; GFX11FAKE16-NEXT: v_cndmask_b16 v66, v6, v5, s25 +; GFX11FAKE16-NEXT: v_cndmask_b16 v67, v8, v7, s23 +; GFX11FAKE16-NEXT: v_cndmask_b16 v68, v10, v9, s21 +; GFX11FAKE16-NEXT: v_cndmask_b16 v10, v18, v17, s19 +; GFX11FAKE16-NEXT: v_cndmask_b16 v9, v21, v20, s17 +; GFX11FAKE16-NEXT: v_cndmask_b16 v8, v24, v23, s15 +; GFX11FAKE16-NEXT: v_cndmask_b16 v7, v27, v26, s13 +; GFX11FAKE16-NEXT: v_cndmask_b16 v6, v30, v29, s11 +; GFX11FAKE16-NEXT: v_cndmask_b16 v5, v33, v32, s9 +; GFX11FAKE16-NEXT: v_cndmask_b16 v4, v36, v35, s7 +; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v64, v55, vcc_lo +; GFX11FAKE16-NEXT: v_cndmask_b16 v3, v53, v52, s1 +; GFX11FAKE16-NEXT: v_cndmask_b16 v17, v50, v49, s3 +; GFX11FAKE16-NEXT: v_cndmask_b16 v18, v39, v38, s5 +; GFX11FAKE16-NEXT: v_cndmask_b16 v20, v2, v1, s0 +; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v54, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v51, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v2, v17, v48, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v3, v18, v37, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v34, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v5, v5, v31, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v6, v6, v28, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v7, v7, v25, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v8, v8, v22, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v19, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v10, v10, v16, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v11, v68, v11, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v12, v67, v12, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v13, v66, v13, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v14, v65, v14, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b ret <32 x bfloat> %op diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index b128be2186df2..beedc60225947 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -1840,14 +1840,14 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1, ; GFX11-SDAG-LABEL: fmul_select_v2f16_test3: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, v1, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3c00, v2, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v2, 0x3c00, v2, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1967,14 +1967,14 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1, ; GFX11-SDAG-LABEL: fmul_select_v2f16_test4: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x3800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0x3800 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, v1, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3c00, v2, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v2, 0x3c00, v2, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2121,20 +2121,20 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX11-SDAG-LABEL: fmul_select_f16_test6: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xc800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0xc800 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x4200, v1, vcc_lo ; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: fmul_select_f16_test6: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x4200 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc800, vcc_lo +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x4200 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b16 v1, v1, 0xc800, vcc_lo ; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 @@ -2209,20 +2209,20 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX11-SDAG-LABEL: fmul_select_f16_test7: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x4800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x4800 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xc400, v1, vcc_lo ; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: fmul_select_f16_test7: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xc400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4800, vcc_lo +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0xc400 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b16 v1, v1, 0x4800, vcc_lo ; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 @@ -2276,7 +2276,7 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, 0, 0x8000, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2625,7 +2625,7 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3f80, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -2740,7 +2740,7 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3f80, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -2885,24 +2885,23 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, v2, v4 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b16 v2, 0x3f80, v5, s0 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3f80, v5, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX11-SDAG-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX11-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) @@ -3044,24 +3043,23 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f00 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, v2, v4 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b16 v2, 0x3f80, v5, s0 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3f80, v5, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX11-SDAG-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX11-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) @@ -3169,7 +3167,7 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x4100, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -3285,7 +3283,7 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x4040, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -3302,10 +3300,10 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-GISEL-LABEL: fmul_select_bf16_test6: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x4040 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x4040 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b16 v1, v1, 0xc100, vcc_lo ; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 @@ -3401,7 +3399,7 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xc080, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -3418,10 +3416,10 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-GISEL-LABEL: fmul_select_bf16_test7: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xc080 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0xc080 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b16 v1, v1, 0x4100, vcc_lo ; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 @@ -3532,7 +3530,7 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo +; GFX11-GISEL-NEXT: v_cndmask_b16 v1, 0, 0x8000, vcc_lo ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3632,7 +3630,7 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xc200, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -3748,7 +3746,7 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xdb80, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -3860,7 +3858,7 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x4c00, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index efbbe2b27f10f..7187801e5990b 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -450,20 +450,22 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB2_4: ; %exit -; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v1 -; GFX11-NEXT: v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo -; GFX11-NEXT: v_pack_b32_f16 v0, v2, v1 -; GFX11-NEXT: v_pack_b32_f16 v1, v3, v4 +; GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0.5, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x3900 +; GFX11-NEXT: v_cmp_ge_f16_e64 s1, 0.5, v0 +; GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0.5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b16 v0, 0x3900, v1, s0 +; GFX11-NEXT: v_cndmask_b16 v2, 0x3d00, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b16 v3, 0x3900, v1, s1 +; GFX11-NEXT: v_cndmask_b16 v1, 0x3900, v1, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3 +; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -1062,20 +1064,22 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB5_4: ; %exit -; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v1 -; GFX11-NEXT: v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo -; GFX11-NEXT: v_pack_b32_f16 v0, v2, v1 -; GFX11-NEXT: v_pack_b32_f16 v1, v3, v4 +; GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0.5, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x3900 +; GFX11-NEXT: v_cmp_ge_f16_e64 s1, 0.5, v0 +; GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0.5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b16 v0, 0x3900, v1, s0 +; GFX11-NEXT: v_cndmask_b16 v2, 0x3d00, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b16 v3, 0x3900, v1, s1 +; GFX11-NEXT: v_cndmask_b16 v1, 0x3900, v1, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3 +; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -1406,34 +1410,34 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB7_4: ; %exit -; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5 -; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, 0x3d00 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_gt_u16_e64 s0, 0x3801, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cmp_lt_u16_e64 s1, 0x3800, v1 +; GFX11-NEXT: v_cndmask_b16 v7, 0x3900, v5, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v1, 0x3900 +; GFX11-NEXT: v_cndmask_b16 v8, 0x3900, v5, s0 ; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x3800, v8 -; GFX11-NEXT: v_perm_b32 v2, v7, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v6, v5, 0x5040100 +; GFX11-NEXT: v_cmp_gt_u16_e64 s0, 0x3801, v2 +; GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x3801, v0 +; GFX11-NEXT: v_cmp_gt_u16_e64 s3, 0x3801, v6 +; GFX11-NEXT: v_cmp_gt_u16_e64 s34, 0x3801, v4 +; GFX11-NEXT: v_cndmask_b16 v2, 0x3900, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x3900, v5, s0 +; GFX11-NEXT: v_cndmask_b16 v3, 0x3900, v5, s2 +; GFX11-NEXT: v_cndmask_b16 v4, 0x3900, v5, s3 +; GFX11-NEXT: v_cndmask_b16 v5, 0x3900, v5, s34 +; GFX11-NEXT: v_cndmask_b16 v6, 0x3d00, v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v5, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v6, v7, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 %cond, label %T, label %F @@ -1697,34 +1701,34 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB8_4: ; %exit -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v5 -; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, 0x3d00 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0.5, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cmp_nge_f16_e64 s1, 0.5, v1 +; GFX11-NEXT: v_cndmask_b16 v7, 0x3900, v5, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v1, 0x3900 +; GFX11-NEXT: v_cndmask_b16 v8, 0x3900, v5, s0 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo -; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v8 -; GFX11-NEXT: v_pack_b32_f16 v2, v4, v7 -; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v3, v5, v6 +; GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0.5, v2 +; GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0.5, v0 +; GFX11-NEXT: v_cmp_ge_f16_e64 s3, 0.5, v6 +; GFX11-NEXT: v_cmp_ge_f16_e64 s34, 0.5, v4 +; GFX11-NEXT: v_cndmask_b16 v2, 0x3900, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x3900, v5, s0 +; GFX11-NEXT: v_cndmask_b16 v3, 0x3900, v5, s2 +; GFX11-NEXT: v_cndmask_b16 v4, 0x3900, v5, s3 +; GFX11-NEXT: v_cndmask_b16 v5, 0x3900, v5, s34 +; GFX11-NEXT: v_cndmask_b16 v6, 0x3d00, v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3 +; GFX11-NEXT: v_pack_b32_f16 v1, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_pack_b32_f16 v2, v8, v5 +; GFX11-NEXT: v_pack_b32_f16 v3, v7, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 %cond, label %T, label %F diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index c3c1540383ec6..ffe7649e4bbb1 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -645,36 +645,36 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] ; GFX11-NEXT: s_cmp_eq_u32 s4, 1 -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v5, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: s_cmp_eq_u32 s4, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 4 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v5, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 5 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v2, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 6 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX11-NEXT: s_cmp_eq_u32 s4, 7 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v0, v3, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -865,69 +865,69 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] ; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] offset:16 ; GFX11-NEXT: s_cmp_eq_u32 s4, 1 -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 2 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v9, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX11-NEXT: s_cmp_eq_u32 s4, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 4 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v9, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 5 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v2, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 6 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX11-NEXT: s_cmp_eq_u32 s4, 7 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v3, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX11-NEXT: s_cmp_eq_u32 s4, 9 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX11-NEXT: s_cmp_eq_u32 s4, 11 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v5, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX11-NEXT: s_cmp_eq_u32 s4, 13 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v6, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 14 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX11-NEXT: s_cmp_eq_u32 s4, 15 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v0, v7, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index e874ee56f594c..91f57e644ec72 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -62,7 +62,7 @@ define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 { ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_f16: @@ -151,11 +151,12 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s0, v0, v1 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v1, v0, s0 +; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; @@ -262,12 +263,13 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s0, v5, v4 +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v4, v5, s0 +; GFX11-SAFE-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; @@ -395,22 +397,23 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX11-SAFE-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s0, v0, v2 +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s1, v5, v4 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s2, v7, v6 +; GFX11-SAFE-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v2, v0, s0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v4, v5, s1 +; GFX11-SAFE-NEXT: v_cndmask_b16 v3, v6, v7, s2 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SAFE-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: @@ -615,36 +618,36 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-SAFE-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7 -; GFX11-SAFE-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s0, v11, v10 +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s1, v13, v12 +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s3, v0, v4 +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s2, v15, v14 +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s4, v1, v5 +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s5, v2, v6 +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s6, v3, v7 +; GFX11-SAFE-NEXT: v_cndmask_b16 v12, v12, v13, s1 +; GFX11-SAFE-NEXT: v_cndmask_b16 v14, v14, v15, s2 +; GFX11-SAFE-NEXT: v_cndmask_b16 v10, v10, v11, s0 +; GFX11-SAFE-NEXT: v_cndmask_b16 v8, v8, v9, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v4, v0, s3 +; GFX11-SAFE-NEXT: v_cndmask_b16 v1, v5, v1, s4 +; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v6, v2, s5 +; GFX11-SAFE-NEXT: v_cndmask_b16 v3, v7, v3, s6 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SAFE-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 +; GFX11-SAFE-NEXT: v_perm_b32 v1, v10, v1, 0x5040100 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-NEXT: v_perm_b32 v2, v12, v2, 0x5040100 +; GFX11-SAFE-NEXT: v_perm_b32 v3, v14, v3, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index 0723290bdf734..b7e9e15a0561f 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -63,7 +63,7 @@ define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 { ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_f16: @@ -152,11 +152,12 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s0, v0, v1 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v1, v0, s0 +; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; @@ -263,12 +264,13 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s0, v5, v4 +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v4, v5, s0 +; GFX11-SAFE-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; @@ -396,22 +398,23 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v4f16: ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX11-SAFE-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s0, v0, v2 +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s1, v5, v4 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s2, v7, v6 +; GFX11-SAFE-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v2, v0, s0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v4, v5, s1 +; GFX11-SAFE-NEXT: v_cndmask_b16 v3, v6, v7, s2 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SAFE-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v4f16: @@ -616,36 +619,36 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v8f16: ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v11, v10 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v13, v12 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v15, v14 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9, v8 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v2, v6 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v4 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-SAFE-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v5 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v7 -; GFX11-SAFE-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s0, v11, v10 +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s1, v13, v12 +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s3, v0, v4 +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s2, v15, v14 +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s4, v1, v5 +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s5, v2, v6 +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s6, v3, v7 +; GFX11-SAFE-NEXT: v_cndmask_b16 v12, v12, v13, s1 +; GFX11-SAFE-NEXT: v_cndmask_b16 v14, v14, v15, s2 +; GFX11-SAFE-NEXT: v_cndmask_b16 v10, v10, v11, s0 +; GFX11-SAFE-NEXT: v_cndmask_b16 v8, v8, v9, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v4, v0, s3 +; GFX11-SAFE-NEXT: v_cndmask_b16 v1, v5, v1, s4 +; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v6, v2, s5 +; GFX11-SAFE-NEXT: v_cndmask_b16 v3, v7, v3, s6 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SAFE-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 +; GFX11-SAFE-NEXT: v_perm_b32 v1, v10, v1, 0x5040100 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-NEXT: v_perm_b32 v2, v12, v2, 0x5040100 +; GFX11-SAFE-NEXT: v_perm_b32 v3, v14, v3, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 64be9cb72a6ee..6c1a7ac56a867 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -425,15 +425,16 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_lshr_b32 s1, s0, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s0, -1.0 +; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s1, -1.0 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| -; GFX11-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-DENORM-NEXT: v_add_f16_e64 v0, |v0|, |v0| ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_cmp_gt_f16_e64 s2, |v0|, |v1| +; GFX11-DENORM-NEXT: v_cndmask_b16 v0, v1, v0, s2 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_add_f16_e64 v0, |v0|, |v0| ; GFX11-DENORM-NEXT: v_mul_f16_e32 v1, v0, v0 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_fma_f16 v0, -v1, v0, 1.0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_store_b16 v2, v0, s[0:1] @@ -444,18 +445,18 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s1, -1.0 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s0, -1.0 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s1, -1.0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_cmp_gt_f16_e64 s0, |v0|, |v1| +; GFX11-FLUSH-NEXT: v_cndmask_b16 v0, v1, v0, s0 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| -; GFX11-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |v0|, |v0| ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |v0|, |v0| ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v0, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v1, v0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v0, 1.0, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index 9ae60f99d5e09..bef424c5287fe 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -4947,9 +4947,9 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_64_1: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x5400 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x3c00, v0, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5009,9 +5009,9 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_1_64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x3c00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5400, v0, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5071,9 +5071,9 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xd400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0xd400 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xbc00, v0, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5133,9 +5133,9 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xbc00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0xbc00 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xd400, v0, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5202,9 +5202,9 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x5800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x5800 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5400, v0, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5267,9 +5267,9 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_4: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x5800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x5800 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x4400, v0, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5336,9 +5336,9 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_2_4: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x4000 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x4400, v0, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5401,9 +5401,9 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_4_128: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x4400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5800, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5800, v0, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5478,10 +5478,10 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x3c00, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3c00, v3, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5560,10 +5560,10 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x3c00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x5400, v3, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5644,10 +5644,10 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xbc00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xd400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xd400, v3, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5738,10 +5738,10 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x5800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x5400, v3, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5838,10 +5838,10 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xd800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xd400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xd400, v3, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5928,10 +5928,10 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xd800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xcc00, v3, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xcc00, v3, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xcc00, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xcc00, v3, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6013,10 +6013,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x5400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v4, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x3c00, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3c00, v4, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6099,10 +6099,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x5400, v4, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6187,10 +6187,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0xd400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v4, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xbc00, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xbc00, v4, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6276,10 +6276,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0xbc00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xd400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xd400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xd400, v4, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6374,10 +6374,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x5800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x5400, v4, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6464,10 +6464,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x5800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x4400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x4400, v4, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6561,10 +6561,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x4400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x4400, v4, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6651,10 +6651,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x4400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5800, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5800, v4, vcc_lo -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5800, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x5800, v4, s0 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index b32630a97b3ad..589804177a747 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -558,12 +558,12 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ; GFX11-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_add_f16_e32 v0, 0, v0 -; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0 ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v0, s0, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x7e00, 0, vcc_lo ; GFX11-SAFE-NEXT: ; return to shader part epilog ; ; GFX11-NSZ-LABEL: fneg_fadd_0_f16: @@ -573,10 +573,10 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0 -; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 +; GFX11-NSZ-NEXT: v_cndmask_b16 v0, v0, s0, s1 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo +; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x7e00, 0, vcc_lo ; GFX11-NSZ-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv half 1.000000e+00, %tmp6 @@ -646,13 +646,12 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, < ; ; GFX11-SAFE-LABEL: fneg_fadd_0_nsz_f16: ; GFX11-SAFE: ; %bb.0: ; %.entry +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s1, s0, 0 ; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 vcc_lo, s0, 0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x8000, v0, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x8000, v0, s1 ; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x7e00, 0, vcc_lo ; GFX11-SAFE-NEXT: ; return to shader part epilog ; ; GFX11-NSZ-LABEL: fneg_fadd_0_nsz_f16: @@ -662,10 +661,10 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, < ; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0 -; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 +; GFX11-NSZ-NEXT: v_cndmask_b16 v0, v0, s0, s1 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo +; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x7e00, 0, vcc_lo ; GFX11-NSZ-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv afn half 1.000000e+00, %tmp6 @@ -3835,7 +3834,7 @@ define half @v_fneg_round_f16(half %a) #0 { ; GFX11-SAFE-NEXT: v_sub_f16_e32 v2, v0, v1 ; GFX11-SAFE-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 +; GFX11-SAFE-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s0 ; GFX11-SAFE-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_add_f16_e32 v0, v1, v0 @@ -3850,7 +3849,7 @@ define half @v_fneg_round_f16(half %a) #0 { ; GFX11-NSZ-NEXT: v_sub_f16_e32 v2, v0, v1 ; GFX11-NSZ-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 +; GFX11-NSZ-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s0 ; GFX11-NSZ-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_sub_f16_e64 v0, -v1, v0 @@ -4677,7 +4676,7 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index b2d30b751ae2c..0077951c4967e 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -164,7 +164,7 @@ define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) { ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %select = select i1 %cond, i16 %arg0, i16 %arg1 @@ -210,14 +210,15 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 +; GFX11-NEXT: v_cndmask_b16 v0, v3, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v1, v5, v4, s0 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 @@ -256,7 +257,7 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, v2, v1, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff8000, v1 ; GFX11-NEXT: global_store_b16 v[3:4], v1, off ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -582,16 +583,16 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v2 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v0 +; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = fneg half %arg0 %select0 = select i1 %cond0, half %arg1, half %fneg0 @@ -618,16 +619,16 @@ define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff8000, v2 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v2, 0xffff8000, v0 +; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = xor i16 %arg0, -32768 %select0 = select i1 %cond0, i16 %arg1, i16 %fneg0 @@ -702,29 +703,30 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 ; GFX11-LABEL: select_fneg_select_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b16 v0, v1, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, v4, v5, s0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: v_perm_b32 v4, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 +; GFX11-NEXT: v_perm_b32 v4, v0, v1, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v2, 1, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v1, v1, v4, s0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = fneg <2 x half> %arg0 %select0 = select <2 x i1> %cond0, <2 x half> %arg1, <2 x half> %fneg0 @@ -787,29 +789,30 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1, ; GFX11-LABEL: select_fneg_xor_select_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b16 v0, v1, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, v4, v5, s0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: v_perm_b32 v4, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 +; GFX11-NEXT: v_perm_b32 v4, v0, v1, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v2, 1, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v1, v1, v4, s0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = xor <2 x i16> %arg0, %select0 = select <2 x i1> %cond0, <2 x i16> %arg1, <2 x i16> %fneg0 diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index 80b4d64b1236f..e6513fa7b920f 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -2366,10 +2366,10 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly % ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f16_e32 v3, v0 -; GFX11-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0| +; GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x7c00, |v0| ; GFX11-NEXT: v_floor_f16_e32 v4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, 0, v3, s0 ; GFX11-NEXT: global_store_b16 v[1:2], v4, off ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2381,10 +2381,10 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_fract_f16_e32 v3, v0 -; GFX12-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0| +; GFX12-NEXT: v_cmp_neq_f16_e64 s0, 0x7c00, |v0| ; GFX12-NEXT: v_floor_f16_e32 v4, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b16 v0, 0, v3, s0 ; GFX12-NEXT: global_store_b16 v[1:2], v4, off ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -2539,19 +2539,19 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_fract_f16_e32 v6, v0 -; GFX11-NEXT: v_floor_f16_e32 v5, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fract_f16_e32 v4, v3 +; GFX11-NEXT: v_fract_f16_e32 v4, v0 +; GFX11-NEXT: v_cmp_class_f16_e64 s1, v0, 0x204 +; GFX11-NEXT: v_floor_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_fract_f16_e32 v5, v3 ; GFX11-NEXT: v_cmp_class_f16_e64 s0, v3, 0x204 -; GFX11-NEXT: v_floor_f16_e32 v7, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 -; GFX11-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 +; GFX11-NEXT: v_floor_f16_e32 v3, v3 +; GFX11-NEXT: v_cndmask_b16 v4, v4, 0, s1 +; GFX11-NEXT: v_cndmask_b16 v5, v5, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v4, v5, v7 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 -; GFX11-NEXT: global_store_b32 v[1:2], v4, off -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3 +; GFX11-NEXT: v_pack_b32_f16 v3, v0, v3 +; GFX11-NEXT: v_pack_b32_f16 v0, v4, v5 +; GFX11-NEXT: global_store_b32 v[1:2], v3, off ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: safe_math_fract_v2f16: @@ -2562,19 +2562,19 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: v_fract_f16_e32 v6, v0 -; GFX12-NEXT: v_floor_f16_e32 v5, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_fract_f16_e32 v4, v3 +; GFX12-NEXT: v_fract_f16_e32 v4, v0 +; GFX12-NEXT: v_cmp_class_f16_e64 s1, v0, 0x204 +; GFX12-NEXT: v_floor_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_fract_f16_e32 v5, v3 ; GFX12-NEXT: v_cmp_class_f16_e64 s0, v3, 0x204 -; GFX12-NEXT: v_floor_f16_e32 v7, v3 -; GFX12-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 -; GFX12-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 +; GFX12-NEXT: v_floor_f16_e32 v3, v3 +; GFX12-NEXT: v_cndmask_b16 v4, v4, 0, s1 +; GFX12-NEXT: v_cndmask_b16 v5, v5, 0, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_pack_b32_f16 v4, v5, v7 -; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 -; GFX12-NEXT: global_store_b32 v[1:2], v4, off -; GFX12-NEXT: v_pack_b32_f16 v0, v0, v3 +; GFX12-NEXT: v_pack_b32_f16 v3, v0, v3 +; GFX12-NEXT: v_pack_b32_f16 v0, v4, v5 +; GFX12-NEXT: global_store_b32 v[1:2], v3, off ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index d09af8fd2ac95..da891a709ac1c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2513,36 +2513,37 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; GFX11-NEXT: s_cmp_eq_u32 s5, 6 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 7 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s5, 4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 4 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 5 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cselect_b32 s7, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s8, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 3 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s9, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s10, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s4, s2 +; GFX11-NEXT: s_cselect_b32 s5, -1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b16 v5, v3, s4, s2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b16 v6, v2, s4, s6 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v7, v1, s4, s8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX11-NEXT: v_cndmask_b16 v0, v0, s4, s10 +; GFX11-NEXT: v_cndmask_b16 v3, v3, s4, s3 +; GFX11-NEXT: v_cndmask_b16 v2, v2, s4, s7 +; GFX11-NEXT: v_cndmask_b16 v1, v1, s4, s9 +; GFX11-NEXT: v_cndmask_b16 v8, v8, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_perm_b32 v3, v3, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 ; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm @@ -3082,69 +3083,69 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX11-NEXT: s_cmp_eq_u32 s5, 6 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 7 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cndmask_b32_e64 v9, v3, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s5, 4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 4 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 5 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cselect_b32 s7, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 2 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s8, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 3 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s9, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s10, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s11, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 14 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s4, s3 -; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s12, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 15 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s13, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 12 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; GFX11-NEXT: v_perm_b32 v2, v10, v2, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v13, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s14, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 13 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s15, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 10 -; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v14, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s16, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 11 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s17, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 8 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v15, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s18, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 9 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: v_perm_b32 v7, v10, v7, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v16, s4, s2 -; GFX11-NEXT: v_perm_b32 v6, v12, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v14, v4, 0x5040100 +; GFX11-NEXT: s_cselect_b32 s5, -1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cndmask_b16 v9, v3, s4, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b16 v13, v7, s4, s12 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_cndmask_b16 v14, v6, s4, s14 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_cndmask_b16 v15, v5, s4, s16 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_cndmask_b16 v16, v4, s4, s18 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b16 v10, v2, s4, s6 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v11, v1, s4, s8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cndmask_b16 v12, v0, s4, s10 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b16 v7, v7, s4, s13 +; GFX11-NEXT: v_cndmask_b16 v6, v6, s4, s15 +; GFX11-NEXT: v_cndmask_b16 v5, v5, s4, s17 +; GFX11-NEXT: v_cndmask_b16 v4, v4, s4, s5 +; GFX11-NEXT: v_cndmask_b16 v3, v3, s4, s3 +; GFX11-NEXT: v_cndmask_b16 v2, v2, s4, s7 +; GFX11-NEXT: v_cndmask_b16 v1, v1, s4, s9 +; GFX11-NEXT: v_cndmask_b16 v0, v0, s4, s11 +; GFX11-NEXT: v_perm_b32 v7, v7, v13, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v5, v15, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v4, v16, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v1, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v0, v12, 0x5040100 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 1d0367db70143..60c3f8f60bccc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -60,10 +60,10 @@ define half @v_maximum_f16(half %src0, half %src1) { ; GFX11-LABEL: v_maximum_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_f16: @@ -180,10 +180,10 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) { ; GFX11-LABEL: v_maximum_f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_f16__nsz: @@ -306,10 +306,10 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_f16__nnan_src0: @@ -387,10 +387,10 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_f16__nnan_src1: @@ -485,10 +485,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX11-LABEL: s_maximum_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_o_f16_e64 s2, s0, s1 ; GFX11-NEXT: v_max_f16_e64 v0, s0, s1 -; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v0 @@ -590,17 +590,17 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX11-LABEL: v_maximum_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_pk_max_f16 v4, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v2f16: @@ -749,17 +749,17 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX11-LABEL: v_maximum_v2f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_pk_max_f16 v4, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v2f16__nsz: @@ -939,17 +939,18 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX11-LABEL: s_maximum_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_o_f16_e64 s2, s0, s1 ; GFX11-NEXT: v_pk_max_f16 v0, s0, s1 -; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 -; GFX11-NEXT: s_lshr_b32 s2, s1, 16 +; GFX11-NEXT: s_lshr_b32 s1, s1, 16 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_o_f16_e64 s0, s0, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v0 @@ -1063,21 +1064,20 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX11-LABEL: v_maximum_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_pk_max_f16 v6, v0, v2 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v1, v3 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s1 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v3f16: @@ -1255,21 +1255,20 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX11-LABEL: v_maximum_v3f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_pk_max_f16 v6, v0, v2 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v1, v3 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s1 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v3f16__nsz: @@ -1469,26 +1468,26 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX11-LABEL: v_maximum_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_pk_max_f16 v7, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_pk_max_f16 v8, v0, v2 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v0, v2 +; GFX11-NEXT: v_cmp_o_f16_e64 s2, v7, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v8, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 +; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f16: @@ -1695,26 +1694,26 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX11-LABEL: v_maximum_v4f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_pk_max_f16 v7, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_pk_max_f16 v8, v0, v2 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v0, v2 +; GFX11-NEXT: v_cmp_o_f16_e64 s2, v7, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v8, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 +; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f16__nsz: @@ -1999,44 +1998,44 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX11-LABEL: v_maximum_v8f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v8, v3, v7 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_pk_max_f16 v10, v2, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX11-NEXT: v_pk_max_f16 v14, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-NEXT: v_pk_max_f16 v8, v3, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11 -; GFX11-NEXT: v_pk_max_f16 v11, v0, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v13, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v2, v6 +; GFX11-NEXT: v_pk_max_f16 v9, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v10, 0x7e00, v8, vcc_lo ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX11-NEXT: v_cndmask_b16 v7, 0x7e00, v9, s0 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v2, v6 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v1, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX11-NEXT: v_pk_max_f16 v12, v0, v4 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX11-NEXT: v_cmp_o_f16_e64 s2, v6, v2 +; GFX11-NEXT: v_cmp_o_f16_e64 s3, v0, v4 +; GFX11-NEXT: v_cmp_o_f16_e64 s4, v11, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s1 +; GFX11-NEXT: v_cndmask_b16 v5, 0x7e00, v12, s3 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s4 +; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s2 +; GFX11-NEXT: v_cndmask_b16 v4, 0x7e00, v4, s0 +; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v4, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v3, v10, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v8f16: @@ -2402,86 +2401,78 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX11-LABEL: v_maximum_v16f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v16, v7, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX11-NEXT: v_pk_max_f16 v18, v7, v15 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v15 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v6, v14 ; GFX11-NEXT: v_pk_max_f16 v15, v6, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_pk_max_f16 v20, v4, v12 -; GFX11-NEXT: v_pk_max_f16 v22, v2, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v17, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; GFX11-NEXT: v_cndmask_b16 v16, 0x7e00, v18, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX11-NEXT: v_pk_max_f16 v20, v5, v13 +; GFX11-NEXT: v_cndmask_b16 v7, 0x7e00, v7, s0 +; GFX11-NEXT: v_cndmask_b16 v17, 0x7e00, v15, s1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; GFX11-NEXT: v_pk_max_f16 v14, v5, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v7, v16, v7, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v6, v15, v6, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 -; GFX11-NEXT: v_pk_max_f16 v17, v3, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v13 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v19, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; GFX11-NEXT: v_cmp_o_f16_e64 s2, v4, v12 +; GFX11-NEXT: v_pk_max_f16 v13, v4, v12 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v11 -; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v20, v19 -; GFX11-NEXT: v_pk_max_f16 v19, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 +; GFX11-NEXT: v_cndmask_b16 v6, 0x7e00, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v14, 0x7e00, v20, s0 +; GFX11-NEXT: v_cndmask_b16 v5, 0x7e00, v5, s1 +; GFX11-NEXT: v_cndmask_b16 v15, 0x7e00, v13, s2 +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v11 +; GFX11-NEXT: v_pk_max_f16 v12, v3, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v2, v10 +; GFX11-NEXT: v_pk_max_f16 v13, v2, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v3, v11, v3, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_pk_max_f16 v22, v0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8 -; GFX11-NEXT: v_perm_b32 v1, v1, v21, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v24, v23 -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v4, v14, 0x5040100 +; GFX11-NEXT: v_cndmask_b16 v18, 0x7e00, v12, s0 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; GFX11-NEXT: v_cndmask_b16 v11, 0x7e00, v13, s1 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v2, v10 +; GFX11-NEXT: v_cmp_o_f16_e64 s2, v1, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; GFX11-NEXT: v_pk_max_f16 v20, v0, v8 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v9 +; GFX11-NEXT: v_cmp_o_f16_e64 s3, v10, v2 +; GFX11-NEXT: v_cmp_o_f16_e64 s4, v0, v8 +; GFX11-NEXT: v_cmp_o_f16_e64 s5, v19, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v13 +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s2 +; GFX11-NEXT: v_cndmask_b16 v9, 0x7e00, v20, s4 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s5 +; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s3 +; GFX11-NEXT: v_cndmask_b16 v8, 0x7e00, v8, s1 +; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v3, s0 +; GFX11-NEXT: v_cndmask_b16 v4, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v8, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v3, v18, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v4, v15, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index f8c2c54af2783..99d0916ae7a28 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -48,10 +48,10 @@ define half @v_minimum_f16(half %src0, half %src1) { ; GFX11-LABEL: v_minimum_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_f16: @@ -145,10 +145,10 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) { ; GFX11-LABEL: v_minimum_f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_f16__nsz: @@ -247,10 +247,10 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_f16__nnan_src0: @@ -314,10 +314,10 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_f16__nnan_src1: @@ -395,10 +395,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX11-LABEL: s_minimum_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_o_f16_e64 s2, s0, s1 ; GFX11-NEXT: v_min_f16_e64 v0, s0, s1 -; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v0 @@ -480,17 +480,17 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX11-LABEL: v_minimum_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_pk_min_f16 v4, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v2f16: @@ -604,17 +604,17 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX11-LABEL: v_minimum_v2f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_pk_min_f16 v4, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v2f16__nsz: @@ -752,17 +752,18 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX11-LABEL: s_minimum_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_o_f16_e64 s2, s0, s1 ; GFX11-NEXT: v_pk_min_f16 v0, s0, s1 -; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 -; GFX11-NEXT: s_lshr_b32 s2, s1, 16 +; GFX11-NEXT: s_lshr_b32 s1, s1, 16 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_o_f16_e64 s0, s0, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v0 @@ -849,21 +850,20 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX11-LABEL: v_minimum_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v1, v3 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s1 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v3f16: @@ -994,21 +994,20 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX11-LABEL: v_minimum_v3f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v1, v3 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s1 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v3f16__nsz: @@ -1154,26 +1153,26 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX11-LABEL: v_minimum_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_pk_min_f16 v7, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_pk_min_f16 v8, v0, v2 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v0, v2 +; GFX11-NEXT: v_cmp_o_f16_e64 s2, v7, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v8, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 +; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f16: @@ -1321,26 +1320,26 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX11-LABEL: v_minimum_v4f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_pk_min_f16 v7, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_pk_min_f16 v8, v0, v2 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v0, v2 +; GFX11-NEXT: v_cmp_o_f16_e64 s2, v7, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v8, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 +; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f16__nsz: @@ -1538,44 +1537,44 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX11-LABEL: v_minimum_v8f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v8, v3, v7 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_pk_min_f16 v10, v2, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX11-NEXT: v_pk_min_f16 v14, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-NEXT: v_pk_min_f16 v8, v3, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11 -; GFX11-NEXT: v_pk_min_f16 v11, v0, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v13, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v2, v6 +; GFX11-NEXT: v_pk_min_f16 v9, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v10, 0x7e00, v8, vcc_lo ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX11-NEXT: v_cndmask_b16 v7, 0x7e00, v9, s0 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v2, v6 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v1, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX11-NEXT: v_pk_min_f16 v12, v0, v4 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX11-NEXT: v_cmp_o_f16_e64 s2, v6, v2 +; GFX11-NEXT: v_cmp_o_f16_e64 s3, v0, v4 +; GFX11-NEXT: v_cmp_o_f16_e64 s4, v11, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s1 +; GFX11-NEXT: v_cndmask_b16 v5, 0x7e00, v12, s3 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s4 +; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s2 +; GFX11-NEXT: v_cndmask_b16 v4, 0x7e00, v4, s0 +; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v4, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v3, v10, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v8f16: @@ -1821,86 +1820,78 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX11-LABEL: v_minimum_v16f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v16, v7, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX11-NEXT: v_pk_min_f16 v18, v7, v15 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v15 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v6, v14 ; GFX11-NEXT: v_pk_min_f16 v15, v6, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_pk_min_f16 v20, v4, v12 -; GFX11-NEXT: v_pk_min_f16 v22, v2, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v17, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; GFX11-NEXT: v_cndmask_b16 v16, 0x7e00, v18, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX11-NEXT: v_pk_min_f16 v20, v5, v13 +; GFX11-NEXT: v_cndmask_b16 v7, 0x7e00, v7, s0 +; GFX11-NEXT: v_cndmask_b16 v17, 0x7e00, v15, s1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; GFX11-NEXT: v_pk_min_f16 v14, v5, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v7, v16, v7, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v6, v15, v6, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 -; GFX11-NEXT: v_pk_min_f16 v17, v3, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v13 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v19, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; GFX11-NEXT: v_cmp_o_f16_e64 s2, v4, v12 +; GFX11-NEXT: v_pk_min_f16 v13, v4, v12 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v11 -; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v20, v19 -; GFX11-NEXT: v_pk_min_f16 v19, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 +; GFX11-NEXT: v_cndmask_b16 v6, 0x7e00, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v14, 0x7e00, v20, s0 +; GFX11-NEXT: v_cndmask_b16 v5, 0x7e00, v5, s1 +; GFX11-NEXT: v_cndmask_b16 v15, 0x7e00, v13, s2 +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v11 +; GFX11-NEXT: v_pk_min_f16 v12, v3, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v2, v10 +; GFX11-NEXT: v_pk_min_f16 v13, v2, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v3, v11, v3, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_pk_min_f16 v22, v0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8 -; GFX11-NEXT: v_perm_b32 v1, v1, v21, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v24, v23 -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v4, v14, 0x5040100 +; GFX11-NEXT: v_cndmask_b16 v18, 0x7e00, v12, s0 +; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; GFX11-NEXT: v_cndmask_b16 v11, 0x7e00, v13, s1 +; GFX11-NEXT: v_cmp_o_f16_e64 s1, v2, v10 +; GFX11-NEXT: v_cmp_o_f16_e64 s2, v1, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; GFX11-NEXT: v_pk_min_f16 v20, v0, v8 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v9 +; GFX11-NEXT: v_cmp_o_f16_e64 s3, v10, v2 +; GFX11-NEXT: v_cmp_o_f16_e64 s4, v0, v8 +; GFX11-NEXT: v_cmp_o_f16_e64 s5, v19, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v13 +; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s2 +; GFX11-NEXT: v_cndmask_b16 v9, 0x7e00, v20, s4 +; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s5 +; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s3 +; GFX11-NEXT: v_cndmask_b16 v8, 0x7e00, v8, s1 +; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v3, s0 +; GFX11-NEXT: v_cndmask_b16 v4, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v8, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v3, v18, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v4, v15, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index c0a85bba93b73..ed029a3c6a259 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -838,7 +838,7 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX11-NEXT: v_sub_f16_e32 v1, s2, v0 ; GFX11-NEXT: v_cmp_ge_f16_e64 s3, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s3 +; GFX11-NEXT: v_cndmask_b16 v1, 0, 0x3c00, s3 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -980,20 +980,20 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_f16_e32 v3, s2, v1 ; GFX11-NEXT: v_sub_f16_e32 v2, s3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v3|, 0.5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_ge_f16_e64 s7, |v3|, 0.5 +; GFX11-NEXT: v_cmp_ge_f16_e64 s6, |v2|, 0.5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v3, 0, 0x3c00, s7 +; GFX11-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s4 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll index 072ee70b840d8..f1678bb8ee4d4 100644 --- a/llvm/test/CodeGen/AMDGPU/lround.ll +++ b/llvm/test/CodeGen/AMDGPU/lround.ll @@ -824,7 +824,7 @@ define half @intrinsic_fround_half(half %arg) { ; GFX11-SDAG-NEXT: v_sub_f16_e32 v2, v0, v1 ; GFX11-SDAG-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s0 ; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_f16_e32 v0, v1, v0 @@ -839,7 +839,7 @@ define half @intrinsic_fround_half(half %arg) { ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 ; GFX11-GISEL-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 +; GFX11-GISEL-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s0 ; GFX11-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v1, v0 @@ -915,7 +915,7 @@ define i32 @intrinsic_lround_i32_f16(half %arg) { ; GFX11-SDAG-NEXT: v_sub_f16_e32 v2, v0, v1 ; GFX11-SDAG-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 +; GFX11-SDAG-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s0 ; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_f16_e32 v0, v1, v0 @@ -933,7 +933,7 @@ define i32 @intrinsic_lround_i32_f16(half %arg) { ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 ; GFX11-GISEL-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 +; GFX11-GISEL-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s0 ; GFX11-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index fa15a42aef2ac..4476d0f43ec4a 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -236,18 +236,19 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v1, v1, v0, s0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 @@ -261,13 +262,12 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximumnum_bf16: @@ -278,18 +278,19 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b16 v1, v1, v0, s0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_eq_u16_e64 s0, 0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 @@ -303,13 +304,12 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo -; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) ret bfloat %result @@ -369,17 +369,18 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, s0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo ; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximumnum_bf16_nnan: @@ -391,17 +392,18 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo -; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 -; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call nnan bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) ret bfloat %result diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index f5fb85d63b8e4..0cdbec9dd094a 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -238,18 +238,19 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v1, v1, v0, s0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 @@ -263,13 +264,12 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimumnum_bf16: @@ -280,18 +280,19 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b16 v1, v1, v0, s0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 @@ -305,13 +306,12 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo -; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y) ret bfloat %result @@ -373,17 +373,18 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, s0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo ; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimumnum_bf16_nnan: @@ -395,17 +396,18 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo -; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 -; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call nnan bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y) ret bfloat %result diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll index 7c1da18de70f8..e0ea9116e214f 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll @@ -34,7 +34,7 @@ define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -77,7 +77,7 @@ define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e64 v1, |v1|, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v4 @@ -123,7 +123,7 @@ define { half, half } @add_select_multi_store_use_lhs_fabs_fabs_f16(i32 %c, half ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3 @@ -169,7 +169,7 @@ define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e64 v1, |v2|, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3 @@ -212,10 +212,10 @@ define half @add_select_fabs_var_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_fabs_var_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -251,10 +251,10 @@ define half @add_select_fabs_negk_f16(i32 %c, half %x, half %y) { ; GFX11-LABEL: add_select_fabs_negk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v0, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -289,10 +289,10 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) { ; GFX11-LABEL: add_select_fabs_negk_negk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v0, 0xc000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v0, vcc_lo ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -326,10 +326,10 @@ define half @add_select_posk_posk_f16(i32 %c, half %x) { ; GFX11-LABEL: add_select_posk_posk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 0x4000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v0, 0x4000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v0, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -367,7 +367,7 @@ define half @add_select_negk_fabs_f16(i32 %c, half %x, half %y) { ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -407,7 +407,7 @@ define half @add_select_negliteralk_fabs_f16(i32 %c, half %x, half %y) { ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xe400, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0xe400, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -443,7 +443,7 @@ define half @add_select_fabs_posk_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -480,7 +480,7 @@ define half @add_select_posk_fabs_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -518,7 +518,7 @@ define half @add_select_fneg_fneg_f16(i32 %c, half %x, half %y, half %z) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -561,7 +561,7 @@ define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_sub_f16_e32 v1, v4, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 @@ -607,7 +607,7 @@ define { half, half } @add_select_multi_store_use_lhs_fneg_fneg_f16(i32 %c, half ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 @@ -653,7 +653,7 @@ define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_sub_f16_e32 v1, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 @@ -696,10 +696,10 @@ define half @add_select_fneg_var_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_fneg_var_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -735,7 +735,7 @@ define half @add_select_fneg_negk_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -773,7 +773,7 @@ define half @add_select_fneg_inv2pi_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xb118, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0xb118, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -811,7 +811,7 @@ define half @add_select_fneg_neginv2pi_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3118, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x3118, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -846,10 +846,10 @@ define half @add_select_negk_negk_f16(i32 %c, half %x) { ; GFX11-LABEL: add_select_negk_negk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v0, 0xc000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v0, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -884,10 +884,10 @@ define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) { ; GFX11-LABEL: add_select_negliteralk_negliteralk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 0xe800 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xec00, v2, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v0, 0xe800 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0xec00, v0, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -920,10 +920,10 @@ define half @add_select_fneg_negk_negk_f16(i32 %c, half %x) { ; GFX11-LABEL: add_select_fneg_negk_negk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v0, 0xc000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v0, vcc_lo ; GFX11-NEXT: v_sub_f16_e32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -959,7 +959,7 @@ define half @add_select_negk_fneg_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -996,7 +996,7 @@ define half @add_select_fneg_posk_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1033,7 +1033,7 @@ define half @add_select_posk_fneg_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1072,11 +1072,11 @@ define half @add_select_negfabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_negfabs_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1116,11 +1116,11 @@ define half @add_select_fabs_negfabs_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_fabs_negfabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_or_b32_e32 v2, 0x8000, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1160,11 +1160,11 @@ define half @add_select_neg_fabs_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_neg_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1203,11 +1203,11 @@ define half @add_select_fabs_neg_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_fabs_neg_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v1 +; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1245,10 +1245,10 @@ define half @add_select_neg_negfabs_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_neg_negfabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1287,10 +1287,10 @@ define half @add_select_negfabs_neg_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_negfabs_neg_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, v0, v2, vcc_lo ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1328,10 +1328,10 @@ define half @mul_select_negfabs_posk_f16(i32 %c, half %x, half %y) { ; GFX11-LABEL: mul_select_negfabs_posk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x4400, v0, vcc_lo ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1371,7 +1371,7 @@ define half @mul_select_posk_negfabs_f16(i32 %c, half %x, half %y) { ; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x4400, v1, vcc_lo ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1408,10 +1408,10 @@ define half @mul_select_negfabs_negk_f16(i32 %c, half %x, half %y) { ; GFX11-LABEL: mul_select_negfabs_negk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0xc400, v0, vcc_lo ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1451,7 +1451,7 @@ define half @mul_select_negk_negfabs_f16(i32 %c, half %x, half %y) { ; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0xc400, v1, vcc_lo ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1493,8 +1493,8 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) { ; GFX11-SAFE-NEXT: v_add_f16_e32 v1, 4.0, v1 ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v1 +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_add_f16: @@ -1519,10 +1519,10 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) { ; GFX11-NSZ-LABEL: select_fneg_posk_src_add_f16: ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NSZ-NEXT: v_sub_f16_e32 v1, -4.0, v1 ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NSZ-NEXT: v_sub_f16_e32 v0, -4.0, v1 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %add = fadd half %x, 4.0 @@ -1558,8 +1558,8 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) { ; GFX11-SAFE-NEXT: v_add_f16_e32 v1, -4.0, v1 ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v1 +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_sub_f16: @@ -1584,10 +1584,10 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) { ; GFX11-NSZ-LABEL: select_fneg_posk_src_sub_f16: ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NSZ-NEXT: v_sub_f16_e32 v1, 4.0, v1 ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NSZ-NEXT: v_sub_f16_e32 v0, 4.0, v1 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %add = fsub half %x, 4.0 @@ -1619,10 +1619,10 @@ define half @select_fneg_posk_src_mul_f16(i32 %c, half %x) { ; GFX11-LABEL: select_fneg_posk_src_mul_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NEXT: v_mul_f16_e32 v0, -4.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %mul = fmul half %x, 4.0 @@ -1660,8 +1660,8 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) { ; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, 4.0, v1 ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2 +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_fma_f16: @@ -1688,10 +1688,10 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) { ; GFX11-NSZ-LABEL: select_fneg_posk_src_fma_f16: ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NSZ-NEXT: v_fma_f16 v1, v1, -4.0, -v2 ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NSZ-NEXT: v_fma_f16 v0, v1, -4.0, -v2 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fma = call half @llvm.fma.f16(half %x, half 4.0, half %z) @@ -1730,8 +1730,8 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) { ; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, 4.0, v1 ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2 +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_fmad_f16: @@ -1759,10 +1759,10 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) { ; GFX11-NSZ-LABEL: select_fneg_posk_src_fmad_f16: ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NSZ-NEXT: v_fma_f16 v1, v1, -4.0, -v2 ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NSZ-NEXT: v_fma_f16 v0, v1, -4.0, -v2 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fmad = call half @llvm.fmuladd.f16(half %x, half 4.0, half %z) diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index d2bb971b68030..d5b5b052e7ccb 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -68,18 +68,18 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-LABEL: add_select_fabs_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v2, v3, v2, s0 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -161,17 +161,17 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v3, v3, v2, s0 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_pk_add_f16 v1, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -246,19 +246,19 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1 ; GFX11-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v3, v3, v2, s0 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -342,17 +342,17 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v2, v3, v2, s0 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_pk_add_f16 v1, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -424,15 +424,16 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -496,14 +497,15 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v0, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -567,13 +569,14 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, 0xc000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v3, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -631,12 +634,12 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, 0x3c00, v3, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -700,13 +703,13 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -772,13 +775,13 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xe400, v2, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xe400, v4, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v1, 0xe400, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0xe400, v0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -842,14 +845,15 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v1, 0x3c00, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v0, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -914,13 +918,13 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v1, 0x3c00, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -985,15 +989,15 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-LABEL: add_select_fneg_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, v5, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1069,16 +1073,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x ; GFX11-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, v6, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v1, v5, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1153,16 +1157,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1 ; GFX11-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, v5, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1240,16 +1244,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x ; GFX11-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, v6, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v1, v5, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1325,15 +1329,16 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1394,14 +1399,14 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-LABEL: add_select_fneg_negk_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, 0x3c00, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1463,14 +1468,14 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11-LABEL: add_select_fneg_inv2pi_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xb118, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xb118, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, 0xb118, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, 0xb118, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1532,14 +1537,14 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX11-LABEL: add_select_fneg_neginv2pi_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3118, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3118, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, 0x3118, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, 0x3118, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1597,12 +1602,12 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, 0xc000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v3, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1661,12 +1666,12 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, 0xe800 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xec00, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xec00, v3, vcc_lo -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, 0xec00, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, 0xec00, v3, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1723,12 +1728,12 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, 0xc000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v3, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1789,14 +1794,14 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-LABEL: add_select_negk_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, 0x3c00, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1857,14 +1862,14 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-LABEL: add_select_fneg_posk_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1925,14 +1930,14 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-LABEL: add_select_posk_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2007,16 +2012,16 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v2, v3, v2, s0 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2091,18 +2096,18 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11-LABEL: add_select_fabs_negfabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v3, 0x80008000, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v3, 0x80008000, v3 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v2, v3, v2, s0 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2179,16 +2184,16 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v2, v3, v2, s0 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2262,18 +2267,18 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX11-LABEL: add_select_fabs_neg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v2, v3, v2, s0 +; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2342,15 +2347,16 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NEXT: v_cndmask_b16 v0, v5, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2419,16 +2425,17 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11-LABEL: add_select_negfabs_neg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v0, v5, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v1, v2, v3, s0 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2498,14 +2505,15 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v1, 0x4400, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x4400, v0, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2575,13 +2583,13 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v2, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v1, 0x4400, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x4400, v0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2650,14 +2658,15 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xc400, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v1, 0xc400, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0xc400, v0, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2727,13 +2736,13 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v2, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xc400, v4, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v1, 0xc400, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0xc400, v0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2809,15 +2818,16 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-SAFE-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_add_v2f16: @@ -2868,14 +2878,14 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NSZ-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-NSZ-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %add = fadd <2 x half> %x, @@ -2945,15 +2955,16 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-SAFE-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_sub_v2f16: @@ -3004,14 +3015,14 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NSZ-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-NSZ-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %add = fsub <2 x half> %x, @@ -3069,14 +3080,14 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_mul_f16 v2, v2, -4.0 op_sel_hi:[1,0] -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %mul = fmul <2 x half> %x, @@ -3152,15 +3163,16 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-SAFE-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_fma_v2f16: @@ -3197,14 +3209,14 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NSZ-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-NSZ-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> , <2 x half> %z) @@ -3282,15 +3294,16 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-SAFE-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 +; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: @@ -3348,14 +3361,14 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NSZ-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-NSZ-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fmad = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %x, <2 x half> , <2 x half> %z) diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll index 50a3336a7483c..5111870da5a63 100644 --- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll @@ -534,7 +534,7 @@ define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b %val = select i1 %cmp, half %a, half %b @@ -567,7 +567,7 @@ define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b %val = select nnan i1 %cmp, half %a, half %b @@ -600,7 +600,7 @@ define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b %val = select nsz i1 %cmp, half %a, half %b @@ -664,7 +664,7 @@ define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge half %a, %b %val = select i1 %cmp, half %a, half %b @@ -697,7 +697,7 @@ define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge half %a, %b %val = select nnan i1 %cmp, half %a, half %b @@ -730,7 +730,7 @@ define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge half %a, %b %val = select nsz i1 %cmp, half %a, half %b @@ -806,11 +806,12 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_ngt_f16_e64 s0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, s0 +; GFX12-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b @@ -856,11 +857,12 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_ngt_f16_e64 s0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, s0 +; GFX12-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b @@ -906,11 +908,12 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_ngt_f16_e64 s0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, s0 +; GFX12-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b @@ -992,11 +995,12 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_nlt_f16_e64 s0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, s0 +; GFX12-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x half> %a, %b @@ -1042,11 +1046,12 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_nlt_f16_e64 s0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, s0 +; GFX12-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x half> %a, %b @@ -1092,11 +1097,12 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_nlt_f16_e64 s0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, s0 +; GFX12-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x half> %a, %b @@ -1193,22 +1199,23 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: v_cmp_ngt_f16_e64 s0, v0, v2 +; GFX12-NEXT: v_cmp_ngt_f16_e64 s1, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cmp_ngt_f16_e64 s2, v7, v6 +; GFX12-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b16 v2, v4, v5, s1 +; GFX12-NEXT: v_cndmask_b16 v3, v6, v7, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -1268,22 +1275,23 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: v_cmp_ngt_f16_e64 s0, v0, v2 +; GFX12-NEXT: v_cmp_ngt_f16_e64 s1, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cmp_ngt_f16_e64 s2, v7, v6 +; GFX12-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b16 v2, v4, v5, s1 +; GFX12-NEXT: v_cndmask_b16 v3, v6, v7, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <4 x half> %a, %b %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -1343,22 +1351,23 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: v_cmp_ngt_f16_e64 s0, v0, v2 +; GFX12-NEXT: v_cmp_ngt_f16_e64 s1, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cmp_ngt_f16_e64 s2, v7, v6 +; GFX12-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b16 v2, v4, v5, s1 +; GFX12-NEXT: v_cndmask_b16 v3, v6, v7, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <4 x half> %a, %b %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -1466,22 +1475,23 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: v_cmp_nlt_f16_e64 s0, v0, v2 +; GFX12-NEXT: v_cmp_nlt_f16_e64 s1, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cmp_nlt_f16_e64 s2, v7, v6 +; GFX12-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b16 v2, v4, v5, s1 +; GFX12-NEXT: v_cndmask_b16 v3, v6, v7, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -1541,22 +1551,23 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: v_cmp_nlt_f16_e64 s0, v0, v2 +; GFX12-NEXT: v_cmp_nlt_f16_e64 s1, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cmp_nlt_f16_e64 s2, v7, v6 +; GFX12-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b16 v2, v4, v5, s1 +; GFX12-NEXT: v_cndmask_b16 v3, v6, v7, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <4 x half> %a, %b %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -1616,22 +1627,23 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: v_cmp_nlt_f16_e64 s0, v0, v2 +; GFX12-NEXT: v_cmp_nlt_f16_e64 s1, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cmp_nlt_f16_e64 s2, v7, v6 +; GFX12-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b16 v2, v4, v5, s1 +; GFX12-NEXT: v_cndmask_b16 v3, v6, v7, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <4 x half> %a, %b %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 572026da79646..28b0d2cfba731 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -101,12 +101,12 @@ define amdgpu_kernel void @select_f16( ; GFX11-NEXT: s_mov_b32 s17, s11 ; GFX11-NEXT: s_mov_b32 s20, s12 ; GFX11-NEXT: s_mov_b32 s21, s13 -; GFX11-NEXT: s_mov_b32 s24, s14 -; GFX11-NEXT: s_mov_b32 s25, s15 ; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s24, s14 +; GFX11-NEXT: s_mov_b32 s25, s15 ; GFX11-NEXT: buffer_load_u16 v2, off, s[24:27], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_u16 v3, off, s[0:3], 0 glc dlc @@ -114,7 +114,7 @@ define amdgpu_kernel void @select_f16( ; GFX11-NEXT: s_mov_b32 s4, s8 ; GFX11-NEXT: s_mov_b32 s5, s9 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v3, v2, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm @@ -210,25 +210,25 @@ define amdgpu_kernel void @select_f16_imm_a( ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_mov_b32 s18, s10 ; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s2 ; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s8, s0 ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -321,25 +321,25 @@ define amdgpu_kernel void @select_f16_imm_b( ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_mov_b32 s18, s10 ; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s2 ; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s8, s0 ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -433,25 +433,25 @@ define amdgpu_kernel void @select_f16_imm_c( ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_mov_b32 s18, s10 ; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s2 ; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: s_mov_b32 s16, s4 ; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x3800, v2, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -545,25 +545,25 @@ define amdgpu_kernel void @select_f16_imm_d( ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_mov_b32 s18, s10 ; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s2 ; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: s_mov_b32 s16, s4 ; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x3800, v2, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -686,12 +686,12 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x44 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s2 -; GFX11-NEXT: s_mov_b32 s7, s3 ; GFX11-NEXT: s_mov_b32 s22, s2 ; GFX11-NEXT: s_mov_b32 s23, s3 ; GFX11-NEXT: s_mov_b32 s18, s2 ; GFX11-NEXT: s_mov_b32 s19, s3 +; GFX11-NEXT: s_mov_b32 s6, s2 +; GFX11-NEXT: s_mov_b32 s7, s3 ; GFX11-NEXT: s_mov_b32 s26, s2 ; GFX11-NEXT: s_mov_b32 s27, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -699,28 +699,30 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-NEXT: s_mov_b32 s21, s13 ; GFX11-NEXT: s_mov_b32 s16, s10 ; GFX11-NEXT: s_mov_b32 s17, s11 +; GFX11-NEXT: buffer_load_b32 v0, off, s[20:23], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-NEXT: s_mov_b32 s24, s14 ; GFX11-NEXT: s_mov_b32 s25, s15 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 -; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v3, off, s[24:27], 0 -; GFX11-NEXT: s_mov_b32 s0, s8 ; GFX11-NEXT: s_mov_b32 s1, s9 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: v_cmp_lt_f16_e64 s0, v5, v4 +; GFX11-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-NEXT: s_mov_b32 s0, s8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -827,42 +829,42 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; ; GFX11-LABEL: select_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s18, s10 -; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s18, s2 +; GFX11-NEXT: s_mov_b32 s19, s3 +; GFX11-NEXT: s_mov_b32 s22, s2 +; GFX11-NEXT: s_mov_b32 s23, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s16, s8 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s17, s9 +; GFX11-NEXT: s_mov_b32 s20, s10 +; GFX11-NEXT: s_mov_b32 s21, s11 ; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_mov_b32 s1, s5 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0x3900, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_cmp_lt_f16_e64 s0, 0x3900, v3 +; GFX11-NEXT: v_cndmask_b16 v1, v2, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, v4, v0, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_mov_b32 s0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, @@ -966,42 +968,42 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; ; GFX11-LABEL: select_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s18, s10 -; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s18, s2 +; GFX11-NEXT: s_mov_b32 s19, s3 +; GFX11-NEXT: s_mov_b32 s22, s2 +; GFX11-NEXT: s_mov_b32 s23, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s16, s8 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s17, s9 +; GFX11-NEXT: s_mov_b32 s20, s10 +; GFX11-NEXT: s_mov_b32 s21, s11 ; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_mov_b32 s1, s5 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0x3900, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_cmp_gt_f16_e64 s0, 0x3900, v3 +; GFX11-NEXT: v_cndmask_b16 v1, v2, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v0, v4, v0, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_mov_b32 s0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1107,42 +1109,42 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; ; GFX11-LABEL: select_v2f16_imm_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s18, s10 -; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s2 +; GFX11-NEXT: s_mov_b32 s19, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s22, s2 +; GFX11-NEXT: s_mov_b32 s23, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: s_mov_b32 s16, s8 +; GFX11-NEXT: s_mov_b32 s17, s9 +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 ; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s20, s10 +; GFX11-NEXT: s_mov_b32 s21, s11 +; GFX11-NEXT: s_mov_b32 s1, s5 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo -; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_nlt_f16_e64 s0, v4, v3 +; GFX11-NEXT: v_cndmask_b16 v1, 0x3800, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x3900, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1248,42 +1250,42 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; ; GFX11-LABEL: select_v2f16_imm_d: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s18, s10 -; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s2 +; GFX11-NEXT: s_mov_b32 s19, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s22, s2 +; GFX11-NEXT: s_mov_b32 s23, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: s_mov_b32 s16, s8 +; GFX11-NEXT: s_mov_b32 s17, s9 +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 ; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s20, s10 +; GFX11-NEXT: s_mov_b32 s21, s11 +; GFX11-NEXT: s_mov_b32 s1, s5 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_lt_f16_e64 s0, v4, v3 +; GFX11-NEXT: v_cndmask_b16 v1, 0x3800, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, 0x3900, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1403,22 +1405,23 @@ define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond ; GFX11-LABEL: v_vselect_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX11-NEXT: v_cndmask_b16 v4, v7, v5, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b16 v5, v9, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b16 v1, v3, v1, s2 ; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <4 x i32> %cond, zeroinitializer %select = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -1596,37 +1599,36 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; GFX11-LABEL: v_vselect_v8f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v19, v18, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v20, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10 -; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX11-NEXT: v_perm_b32 v3, v15, v3, 0x5040100 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v10 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v9 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 0, v11 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v13 +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 0, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v14 +; GFX11-NEXT: v_cndmask_b16 v8, v9, v8, s5 +; GFX11-NEXT: v_cndmask_b16 v9, v11, v10, s4 +; GFX11-NEXT: v_cndmask_b16 v10, v13, v12, s3 +; GFX11-NEXT: v_cndmask_b16 v11, v16, v15, s2 +; GFX11-NEXT: v_cndmask_b16 v0, v4, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, v5, v1, s0 +; GFX11-NEXT: v_cndmask_b16 v2, v6, v2, s1 +; GFX11-NEXT: v_cndmask_b16 v3, v7, v3, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v0, v11, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v10, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v9, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v8, v3, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <8 x i32> %cond, zeroinitializer %select = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b @@ -1990,67 +1992,64 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v35, v34, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v37, v36, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v6, v8, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v39, v38, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v53, v52, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v4, v10, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v55, v54, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v21 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v51, v50, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v23 -; GFX11-NEXT: v_cndmask_b32_e32 v14, v49, v48, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v14, v3, 0x5040100 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v18 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v20 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v22 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 0, v24 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v26 +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 +; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v30 +; GFX11-NEXT: v_cmp_eq_u32_e64 s7, 0, v17 +; GFX11-NEXT: v_cmp_eq_u32_e64 s8, 0, v19 +; GFX11-NEXT: v_cmp_eq_u32_e64 s9, 0, v21 +; GFX11-NEXT: v_cmp_eq_u32_e64 s10, 0, v23 +; GFX11-NEXT: v_cmp_eq_u32_e64 s11, 0, v25 +; GFX11-NEXT: v_cmp_eq_u32_e64 s12, 0, v27 +; GFX11-NEXT: v_cmp_eq_u32_e64 s13, 0, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; GFX11-NEXT: v_cndmask_b16 v0, v8, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v18, v19, v18, s13 +; GFX11-NEXT: v_cndmask_b16 v19, v21, v20, s12 +; GFX11-NEXT: v_cndmask_b16 v20, v23, v22, s11 +; GFX11-NEXT: v_cndmask_b16 v21, v25, v24, s10 +; GFX11-NEXT: v_cndmask_b16 v22, v27, v26, s9 +; GFX11-NEXT: v_cndmask_b16 v23, v29, v28, s8 +; GFX11-NEXT: v_cndmask_b16 v24, v32, v30, s7 +; GFX11-NEXT: v_cndmask_b16 v7, v15, v7, s6 +; GFX11-NEXT: v_cndmask_b16 v6, v14, v6, s5 +; GFX11-NEXT: v_cndmask_b16 v5, v13, v5, s4 +; GFX11-NEXT: v_cndmask_b16 v4, v12, v4, s3 +; GFX11-NEXT: v_cndmask_b16 v1, v9, v1, s0 +; GFX11-NEXT: v_cndmask_b16 v2, v10, v2, s1 +; GFX11-NEXT: v_cndmask_b16 v3, v11, v3, s2 +; GFX11-NEXT: v_perm_b32 v0, v24, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v20, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v23, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v22, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v21, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v19, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v18, v6, 0x5040100 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v33, v32, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v12, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v7, v11, v7, 0x5040100 +; GFX11-NEXT: v_cndmask_b16 v8, v17, v16, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <16 x i32> %cond, zeroinitializer %select = select <16 x i1> %cmp, <16 x half> %a, <16 x half> %b @@ -2922,39 +2921,40 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v83, off, s32 +; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:96 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v15 ; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v14 ; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v30 ; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v13 @@ -2982,131 +2982,123 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v18 ; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v15 ; GFX11-NEXT: s_waitcnt vmcnt(32) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; GFX11-NEXT: v_cndmask_b32_e32 v97, v98, v97, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(31) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v99, v100, v99, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(30) +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; GFX11-NEXT: s_waitcnt vmcnt(29) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v102, v101, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v34 ; GFX11-NEXT: s_waitcnt vmcnt(28) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v35 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v112, v103, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 0, v35 ; GFX11-NEXT: s_waitcnt vmcnt(27) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v114, v113, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v36 ; GFX11-NEXT: s_waitcnt vmcnt(26) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v116, v115, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 0, v37 ; GFX11-NEXT: s_waitcnt vmcnt(25) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v38, v118, v117, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v38 ; GFX11-NEXT: s_waitcnt vmcnt(24) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v39 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v128, v119, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s7, 0, v39 ; GFX11-NEXT: s_waitcnt vmcnt(23) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v130, v129, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s8, 0, v48 ; GFX11-NEXT: s_waitcnt vmcnt(22) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v49 -; GFX11-NEXT: v_cndmask_b32_e32 v49, v132, v131, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s9, 0, v49 ; GFX11-NEXT: s_waitcnt vmcnt(21) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v50 -; GFX11-NEXT: v_cndmask_b32_e32 v50, v134, v133, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s10, 0, v50 ; GFX11-NEXT: s_waitcnt vmcnt(20) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v51 -; GFX11-NEXT: v_cndmask_b32_e32 v51, v144, v135, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s11, 0, v51 ; GFX11-NEXT: s_waitcnt vmcnt(19) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v52 -; GFX11-NEXT: v_cndmask_b32_e32 v52, v146, v145, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s12, 0, v52 ; GFX11-NEXT: s_waitcnt vmcnt(18) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v53 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v147, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s13, 0, v53 ; GFX11-NEXT: s_waitcnt vmcnt(17) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v54 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v98, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s14, 0, v54 ; GFX11-NEXT: s_waitcnt vmcnt(16) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v55 -; GFX11-NEXT: v_cndmask_b32_e32 v15, v33, v15, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s15, 0, v55 ; GFX11-NEXT: s_waitcnt vmcnt(15) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v64 -; GFX11-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s16, 0, v64 ; GFX11-NEXT: s_waitcnt vmcnt(14) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v65 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v14, v97, v14, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s17, 0, v65 ; GFX11-NEXT: s_waitcnt vmcnt(13) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v66 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s18, 0, v66 ; GFX11-NEXT: s_waitcnt vmcnt(12) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v67 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s19, 0, v67 ; GFX11-NEXT: s_waitcnt vmcnt(11) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v68 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s20, 0, v68 ; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v69 -; GFX11-NEXT: v_perm_b32 v13, v99, v13, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s21, 0, v69 ; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v70 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s22, 0, v70 ; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v71 -; GFX11-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s23, 0, v71 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v80 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s24, 0, v80 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v81 -; GFX11-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s25, 0, v81 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v82 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s26, 0, v82 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v83 -; GFX11-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v4, v50, v4, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v83 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v84 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s27, 0, v84 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v85 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s28, 0, v85 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v86 -; GFX11-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v32, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s29, 0, v86 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v87 -; GFX11-NEXT: v_perm_b32 v3, v51, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v52, v2, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v100, v96, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_hi, 0, v87 +; GFX11-NEXT: v_cndmask_b16 v34, v34, v96, s26 +; GFX11-NEXT: v_cndmask_b16 v35, v98, v97, s27 +; GFX11-NEXT: v_cndmask_b16 v36, v100, v99, s28 +; GFX11-NEXT: v_cndmask_b16 v37, v102, v101, s29 +; GFX11-NEXT: v_cndmask_b16 v38, v112, v103, vcc_hi +; GFX11-NEXT: v_cndmask_b16 v39, v114, v113, s25 +; GFX11-NEXT: v_cndmask_b16 v48, v116, v115, s24 +; GFX11-NEXT: v_cndmask_b16 v49, v118, v117, s23 +; GFX11-NEXT: v_cndmask_b16 v50, v128, v119, s22 +; GFX11-NEXT: v_cndmask_b16 v51, v130, v129, s21 +; GFX11-NEXT: v_cndmask_b16 v52, v132, v131, s20 +; GFX11-NEXT: v_cndmask_b16 v53, v134, v133, s19 +; GFX11-NEXT: v_cndmask_b16 v54, v144, v135, s18 +; GFX11-NEXT: v_cndmask_b16 v55, v146, v145, s17 +; GFX11-NEXT: v_cndmask_b16 v31, v31, v147, s16 +; GFX11-NEXT: v_cndmask_b16 v32, v33, v32, s15 +; GFX11-NEXT: v_cndmask_b16 v15, v83, v15, s14 +; GFX11-NEXT: v_cndmask_b16 v14, v30, v14, s13 +; GFX11-NEXT: v_cndmask_b16 v13, v29, v13, s12 +; GFX11-NEXT: v_cndmask_b16 v12, v28, v12, s11 +; GFX11-NEXT: v_cndmask_b16 v11, v27, v11, s10 +; GFX11-NEXT: v_cndmask_b16 v10, v26, v10, s9 +; GFX11-NEXT: v_cndmask_b16 v9, v25, v9, s8 +; GFX11-NEXT: v_cndmask_b16 v8, v24, v8, s7 +; GFX11-NEXT: v_cndmask_b16 v7, v23, v7, s6 +; GFX11-NEXT: v_cndmask_b16 v6, v22, v6, s5 +; GFX11-NEXT: v_cndmask_b16 v5, v21, v5, s4 +; GFX11-NEXT: v_cndmask_b16 v0, v16, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b16 v1, v17, v1, s0 +; GFX11-NEXT: v_cndmask_b16 v2, v18, v2, s1 +; GFX11-NEXT: v_cndmask_b16 v3, v19, v3, s2 +; GFX11-NEXT: v_cndmask_b16 v4, v20, v4, s3 +; GFX11-NEXT: v_perm_b32 v0, v32, v0, 0x5040100 ; GFX11-NEXT: v_perm_b32 v1, v31, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v54, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v53, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v52, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v51, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v8, v49, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v10, v39, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v11, v38, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v12, v37, v12, 0x5040100 +; GFX11-NEXT: v_perm_b32 v13, v36, v13, 0x5040100 +; GFX11-NEXT: v_perm_b32 v14, v35, v14, 0x5040100 +; GFX11-NEXT: v_perm_b32 v15, v34, v15, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <32 x i32> %cond, zeroinitializer %select = select <32 x i1> %cmp, <32 x half> %a, <32 x half> %b diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index f20c1ccb2d63e..13e1da4a96c8d 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 declare half @llvm.fabs.f16(half) @@ -90,6 +91,24 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_cnd_nan_nosgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s2, 0 +; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx %f = load float, ptr addrspace(1) %f.gep @@ -155,6 +174,18 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_cnd_nan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s2, 0 +; GFX12-NEXT: s_cselect_b32 s2, s3, -1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %setcc = icmp ne i32 %c, 0 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f store float %select, ptr addrspace(1) %out @@ -220,6 +251,21 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5] ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_nlg_f32 s0, 0 +; GFX12-NEXT: s_cselect_b32 s0, s1, 1.0 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext @@ -285,6 +331,19 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s6, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, 1.0 +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext @@ -350,6 +409,21 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5] ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_nlg_f32 s0, 0 +; GFX12-NEXT: s_cselect_b32 s0, s1, 0 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext @@ -415,6 +489,19 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s6, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, 0 +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext @@ -498,6 +585,23 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 +; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext @@ -583,6 +687,23 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 +; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext @@ -661,6 +782,21 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX12-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -751,6 +887,24 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -843,6 +997,24 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -939,6 +1111,25 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v4, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b64 v[2:3], v4, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc +; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i64, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1048,6 +1239,28 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5 +; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1157,6 +1370,28 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5 +; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1268,6 +1503,28 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5 +; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1375,6 +1632,29 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX11-NEXT: global_store_b8 v0, v1, s[8:9] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v1, s[10:11] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_u8 v2, v0, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 +; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX12-NEXT: global_store_b8 v0, v1, s[8:9] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1479,6 +1759,26 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v1, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 0, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1581,6 +1881,26 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v1, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1674,6 +1994,24 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1783,6 +2121,28 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 +; GFX12-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1882,14 +2242,35 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s[2:3] ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_cndmask_abs_neg_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX12-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, s[2:3] +; GFX12-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx %f = load half, ptr addrspace(1) %f.gep @@ -1981,6 +2362,24 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_cndmask_abs_neg_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx %f = load float, ptr addrspace(1) %f.gep @@ -2086,6 +2485,28 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_cndmask_abs_neg_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v0, s[0:1] +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 +; GFX12-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx %f = load double, ptr addrspace(1) %f.gep diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir index c936c13ac6c66..d91ee54215924 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir @@ -159,16 +159,16 @@ name: mask_hazard_cndmask_dpp3 body: | bb.0: ; GFX11-LABEL: name: mask_hazard_cndmask_dpp3 - ; GFX11: $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec + ; GFX11: $vgpr0 = V_CNDMASK_B16_fake16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec ; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 ; ; GFX12-LABEL: name: mask_hazard_cndmask_dpp3 - ; GFX12: $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec + ; GFX12: $vgpr0 = V_CNDMASK_B16_fake16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec ; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX12-NEXT: S_ENDPGM 0 - $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec + $vgpr0 = V_CNDMASK_B16_fake16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc S_ENDPGM 0 ... diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s index 6bc92bc29ea8a..40e3fbda47787 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s @@ -899,104 +899,131 @@ v_bfm_b32 v5, src_scc, vcc_lo v_bfm_b32 v255, 0xaf123456, vcc_hi // GFX11: v_bfm_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cndmask_b16 v5, v1, src_scc, s3 -// W32: v_cndmask_b16 v5, v1, src_scc, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00] -// W64-ERR: :[[@LINE-2]]:32: error: invalid operand for instruction +v_cndmask_b16 v5.l, v1.l, src_scc, s3 +// W32: v_cndmask_b16 v5.l, v1.l, src_scc, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:36: error: invalid operand for instruction -v_cndmask_b16 v5, v255, 0.5, s3 -// W32: v_cndmask_b16 v5, v255, 0.5, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00] -// W64-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction +v_cndmask_b16 v5.l, v255.l, 0.5, s3 +// W32: v_cndmask_b16 v5.l, v255.l, 0.5, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction -v_cndmask_b16 v5, s105, s105, s3 -// W32: v_cndmask_b16 v5, s105, s105, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction - -v_cndmask_b16 v5, vcc_hi, v2, s3 -// W32: v_cndmask_b16 v5, vcc_hi, v2, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, s105, s105, s3 +// W32: v_cndmask_b16 v5.l, s105, s105, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, ttmp15, ttmp15, s3 -// W32: v_cndmask_b16 v5, ttmp15, ttmp15, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, vcc_hi, v2.l, s3 +// W32: v_cndmask_b16 v5.l, vcc_hi, v2.l, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, m0, v255, s3 -// W32: v_cndmask_b16 v5, m0, v255, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, ttmp15, ttmp15, s3 +// W32: v_cndmask_b16 v5.l, ttmp15, ttmp15, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_lo, exec_lo, s3 -// W32: v_cndmask_b16 v5, exec_lo, exec_lo, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, m0, v255.l, s3 +// W32: v_cndmask_b16 v5.l, m0, v255.l, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_hi, exec_hi, s3 -// W32: v_cndmask_b16 v5, exec_hi, exec_hi, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_lo, exec_lo, s3 +// W32: v_cndmask_b16 v5.l, exec_lo, exec_lo, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, null, m0, s105 -// W32: v_cndmask_b16 v5, null, m0, s105 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_hi, exec_hi, s3 +// W32: v_cndmask_b16 v5.l, exec_hi, exec_hi, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo -// W32: v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, null, m0, s105 +// W32: v_cndmask_b16 v5.l, null, m0, s105 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, 0.5, -1, vcc_hi -// W32: v_cndmask_b16 v5, 0.5, -1, vcc_hi ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo +// W32: v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -|src_scc|, null, ttmp15 -// W32: v_cndmask_b16 v5, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21] +v_cndmask_b16 v5.l, 0.5, -1, vcc_hi +// W32: v_cndmask_b16 v5.l, 0.5, -1, vcc_hi ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01] // W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, v1, src_scc, s[6:7] -// W64: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] -// W32-ERR: :[[@LINE-2]]:32: error: invalid operand for instruction +v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15 +// W32: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21] +// W64-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction -v_cndmask_b16 v5, v255, 0.5, s[6:7] -// W64: v_cndmask_b16 v5, v255, 0.5, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00] -// W32-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction +v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] +// W64: v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +// W32-ERR: :[[@LINE-2]]:36: error: invalid operand for instruction -v_cndmask_b16 v5, s105, s105, s[6:7] -// W64: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, v255.l, 0.5, s[6:7] +// W64: v_cndmask_b16 v5.l, v255.l, 0.5, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00] +// W32-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction -v_cndmask_b16 v5, vcc_hi, v2, s[6:7] -// W64: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, s105, s105, s[6:7] +// W64: v_cndmask_b16 v5.l, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] -// W64: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] +// W64: v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, m0, v255, s[6:7] -// W64: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] +// W64: v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] -// W64: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, m0, v255.l, s[6:7] +// W64: v_cndmask_b16 v5.l, m0, v255.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] -// W64: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] +// W64: v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, null, m0, s[6:7] -// W64: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] +// W64: v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] -// W64: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, null, m0, s[6:7] +// W64: v_cndmask_b16 v5.l, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] +// W64: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, 0.5, -1, vcc +// W64: v_cndmask_b16 v5.l, 0.5, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] +// W64: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +// W32-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction + +v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null +// GFX11: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo +// W32: v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, 0x3800, -1, vcc +// W64: v_cndmask_b16 v5.l, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, v255.h, 0.5, s3 +// W32: v_cndmask_b16 v5.l, v255.h, 0.5, s3 ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xe1,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction + +v_cndmask_b16 v5.l, m0, v255.h, s3 +// W32: v_cndmask_b16 v5.l, m0, v255.h, s3 ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, 0.5, -1, vcc -// W64: v_cndmask_b16 v5, 0.5, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, v255.h, 0.5, s[6:7] +// W64: v_cndmask_b16 v5.l, v255.h, 0.5, s[6:7] ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xe1,0x19,0x00] +// W32-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction -v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] -// W64: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +v_cndmask_b16 v5.l, m0, v255.h, s[6:7] +// W64: v_cndmask_b16 v5.l, m0, v255.h, s[6:7] ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] // W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null -// GFX11: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null +// GFX11: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] v_cubeid_f32 v5, v1, v2, s3 // GFX11: v_cubeid_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s index 5fa1334aa6e95..2bff644605ff6 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s @@ -765,112 +765,139 @@ v_bfm_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_half_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 -// W32: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 -// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] +v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xae,0x41,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xae,0x41,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX11: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s index 2fc02061c59de..2f9b5efca9e17 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s @@ -424,44 +424,71 @@ v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 -// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xae,0x41,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xae,0x41,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x43,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index 3e7b7d28c2e97..cd4ed2b9458e6 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -857,104 +857,131 @@ v_bfm_b32 v5, src_scc, vcc_lo v_bfm_b32 v255, 0xaf123456, vcc_hi // GFX12: v_bfm_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cndmask_b16 v5, v1, src_scc, s3 -// W32: v_cndmask_b16 v5, v1, src_scc, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00] -// W64-ERR: :[[@LINE-2]]:32: error: invalid operand for instruction +v_cndmask_b16 v5.l, v1.l, src_scc, s3 +// W32: v_cndmask_b16 v5.l, v1.l, src_scc, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:36: error: invalid operand for instruction -v_cndmask_b16 v5, v255, 0.5, s3 -// W32: v_cndmask_b16 v5, v255, 0.5, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00] -// W64-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction +v_cndmask_b16 v5.l, v255.l, 0.5, s3 +// W32: v_cndmask_b16 v5.l, v255.l, 0.5, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction -v_cndmask_b16 v5, s105, s105, s3 -// W32: v_cndmask_b16 v5, s105, s105, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction - -v_cndmask_b16 v5, vcc_hi, v2, s3 -// W32: v_cndmask_b16 v5, vcc_hi, v2, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, s105, s105, s3 +// W32: v_cndmask_b16 v5.l, s105, s105, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, ttmp15, ttmp15, s3 -// W32: v_cndmask_b16 v5, ttmp15, ttmp15, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, vcc_hi, v2.l, s3 +// W32: v_cndmask_b16 v5.l, vcc_hi, v2.l, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, m0, v255, s3 -// W32: v_cndmask_b16 v5, m0, v255, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, ttmp15, ttmp15, s3 +// W32: v_cndmask_b16 v5.l, ttmp15, ttmp15, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_lo, exec_lo, s3 -// W32: v_cndmask_b16 v5, exec_lo, exec_lo, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, m0, v255.l, s3 +// W32: v_cndmask_b16 v5.l, m0, v255.l, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_hi, exec_hi, s3 -// W32: v_cndmask_b16 v5, exec_hi, exec_hi, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_lo, exec_lo, s3 +// W32: v_cndmask_b16 v5.l, exec_lo, exec_lo, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, null, m0, s105 -// W32: v_cndmask_b16 v5, null, m0, s105 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_hi, exec_hi, s3 +// W32: v_cndmask_b16 v5.l, exec_hi, exec_hi, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo -// W32: v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, null, m0, s105 +// W32: v_cndmask_b16 v5.l, null, m0, s105 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, 0.5, -1, vcc_hi -// W32: v_cndmask_b16 v5, 0.5, -1, vcc_hi ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo +// W32: v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -|src_scc|, null, ttmp15 -// W32: v_cndmask_b16 v5, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21] +v_cndmask_b16 v5.l, 0.5, -1, vcc_hi +// W32: v_cndmask_b16 v5.l, 0.5, -1, vcc_hi ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01] // W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, v1, src_scc, s[6:7] -// W64: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] -// W32-ERR: :[[@LINE-2]]:32: error: invalid operand for instruction +v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15 +// W32: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21] +// W64-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction -v_cndmask_b16 v5, v255, 0.5, s[6:7] -// W64: v_cndmask_b16 v5, v255, 0.5, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00] -// W32-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction +v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] +// W64: v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +// W32-ERR: :[[@LINE-2]]:36: error: invalid operand for instruction -v_cndmask_b16 v5, s105, s105, s[6:7] -// W64: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, v255.l, 0.5, s[6:7] +// W64: v_cndmask_b16 v5.l, v255.l, 0.5, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00] +// W32-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction -v_cndmask_b16 v5, vcc_hi, v2, s[6:7] -// W64: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, s105, s105, s[6:7] +// W64: v_cndmask_b16 v5.l, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] -// W64: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] +// W64: v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, m0, v255, s[6:7] -// W64: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] +// W64: v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] -// W64: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, m0, v255.l, s[6:7] +// W64: v_cndmask_b16 v5.l, m0, v255.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] -// W64: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] +// W64: v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, null, m0, s[6:7] -// W64: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] +// W64: v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] -// W64: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, null, m0, s[6:7] +// W64: v_cndmask_b16 v5.l, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] +// W64: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, 0.5, -1, vcc +// W64: v_cndmask_b16 v5.l, 0.5, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] +// W64: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +// W32-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction + +v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null +// GFX12: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] -v_cndmask_b16 v5, 0.5, -1, vcc -// W64: v_cndmask_b16 v5, 0.5, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, v255.h, 0.5, s3 +// W32: v_cndmask_b16 v5.l, v255.h, 0.5, s3 ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xe1,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction + +v_cndmask_b16 v5.l, m0, v255.h, s3 +// W32: v_cndmask_b16 v5.l, m0, v255.h, s3 ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, v255.h, 0.5, s[6:7] +// W64: v_cndmask_b16 v5.l, v255.h, 0.5, s[6:7] ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xe1,0x19,0x00] +// W32-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction + +v_cndmask_b16 v5.l, m0, v255.h, s[6:7] +// W64: v_cndmask_b16 v5.l, m0, v255.h, s[6:7] ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo +// W32: v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] -// W64: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +v_cndmask_b16 v5.l, 0x3800, -1, vcc +// W64: v_cndmask_b16 v5.l, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] // W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null -// GFX12: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null +// GFX12: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] v_cubeid_f32 v5, v1, v2, s3 // GFX12: v_cubeid_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index aa804cc302bf0..78ce7451c1ba7 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -869,128 +869,147 @@ v_bfm_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, s2, s3 row_mirror -// W32: v_cndmask_b16_e64_dpp v5, v1, s2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0c,0x00,0x01,0x40,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s3 row_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0c,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, 10, s3 row_mirror -// W32: v_cndmask_b16_e64_dpp v5, v1, 10, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x0d,0x00,0x01,0x40,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s3 row_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x0d,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_half_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 -// W32: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, s2, s[6:7] row_half_mirror -// W64: v_cndmask_b16_e64_dpp v5, v1, s2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x18,0x00,0x01,0x41,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s[6:7] row_half_mirror +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x18,0x00,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, 10, s[6:7] row_half_mirror -// W64: v_cndmask_b16_e64_dpp v5, v1, 10, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x19,0x00,0x01,0x41,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s[6:7] row_half_mirror +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x19,0x00,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 -// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] +v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xae,0x41,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index e93a65ec92e73..b41f92b889368 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -516,56 +516,75 @@ v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s3 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, 10, s3 dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, 10, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x14,0x0d,0x00,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s3 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x14,0x0d,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 -// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 -// W64: v_cndmask_b16_e64_dpp v5, -v1, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xe8,0x21,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xe8,0x21,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:42: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xae,0x41,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x43,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt index adcca58776100..05174e3128919 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt @@ -1054,55 +1054,100 @@ # GFX11: v_bfm_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00 -# W32: v_cndmask_b16 v5, v1, src_scc, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] -# W64: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, v1.l, src_scc, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W32-FAKE16: v_cndmask_b16 v5, v1, src_scc, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W64-FAKE16: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] 0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00 -# W32: v_cndmask_b16 v5, v255, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] -# W64: v_cndmask_b16 v5, v255, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, v255.l, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, v255.l, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00 -# W32: v_cndmask_b16 v5, s105, s105, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] -# W64: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, s105, s105, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, s105, s105, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00 -# W32: v_cndmask_b16 v5, vcc_hi, v2, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] -# W64: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, vcc_hi, v2.l, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W32-FAKE16: v_cndmask_b16 v5, vcc_hi, v2, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W64-FAKE16: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] 0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00 -# W32: v_cndmask_b16 v5, ttmp15, ttmp15, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] -# W64: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, ttmp15, ttmp15, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, ttmp15, ttmp15, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00 -# W32: v_cndmask_b16 v5, m0, v255, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] -# W64: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, m0, v255.l, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W32-FAKE16: v_cndmask_b16 v5, m0, v255, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, m0, v255.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-FAKE16: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] 0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00 -# W32: v_cndmask_b16 v5, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] -# W64: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00 -# W32: v_cndmask_b16 v5, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] -# W64: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00 -# W32: v_cndmask_b16 v5, null, m0, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] -# W64: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, null, m0, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, null, m0, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] 0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41 -# W32: v_cndmask_b16 v5, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] -# W64: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W32-REAL16: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W32-FAKE16: v_cndmask_b16 v5, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W64-REAL16: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W64-FAKE16: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] 0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01 -# W32: v_cndmask_b16 v5, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] -# W64: v_cndmask_b16 v5, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v5, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v5, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] 0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21 -# W32: v_cndmask_b16 v5, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] -# W64: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W32-REAL16: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W32-FAKE16: v_cndmask_b16 v5, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W64-REAL16: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W64-FAKE16: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] 0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 -# GFX11: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +0x05,0x08,0x5d,0xd6,0xff,0xe1,0x19,0x00 +# W32-REAL16: v_cndmask_b16 v5.l, v255.h, 0x3800, s6 ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, v255.h, 0x3800, s[6:7] ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] + +0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00 +# W32-REAL16: v_cndmask_b16 v5.l, m0, v255.h, s6 ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W32-FAKE16: v_cndmask_b16 v5, m0, v255, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, m0, v255.h, s[6:7] ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-FAKE16: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] + +0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00 # GFX11: v_cubeid_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt index 2964360a77fd2..c9ef3c714213d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt @@ -738,65 +738,118 @@ # GFX11: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] 0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30 -# GFX11: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30] 0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30 -# GFX11: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30] 0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30 -# GFX11: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] + +0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] + +0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] + +0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30 +# W32-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff # GFX11: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt index 7a81ba23afa35..1e74b5aec0cf3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt @@ -396,29 +396,64 @@ # GFX11: v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] 0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00 -# GFX11: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00] 0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00 -# GFX11: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00] 0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00 -# GFX11: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W32-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] + +0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] + +0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] + +0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00 +# W32-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] 0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 # GFX11: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt index 633d3a48634fa..4108fd9c8be62 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt @@ -1018,55 +1018,100 @@ # GFX12: v_bfm_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00 -# W32: v_cndmask_b16 v5, v1, src_scc, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] -# W64: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, v1.l, src_scc, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W32-FAKE16: v_cndmask_b16 v5, v1, src_scc, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W64-FAKE16: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] 0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00 -# W32: v_cndmask_b16 v5, v255, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] -# W64: v_cndmask_b16 v5, v255, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, v255.l, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, v255.l, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00 -# W32: v_cndmask_b16 v5, s105, s105, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] -# W64: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, s105, s105, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, s105, s105, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00 -# W32: v_cndmask_b16 v5, vcc_hi, v2, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] -# W64: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, vcc_hi, v2.l, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W32-FAKE16: v_cndmask_b16 v5, vcc_hi, v2, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W64-FAKE16: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] 0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00 -# W32: v_cndmask_b16 v5, ttmp15, ttmp15, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] -# W64: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, ttmp15, ttmp15, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, ttmp15, ttmp15, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00 -# W32: v_cndmask_b16 v5, m0, v255, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] -# W64: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, m0, v255.l, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W32-FAKE16: v_cndmask_b16 v5, m0, v255, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, m0, v255.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-FAKE16: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] 0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00 -# W32: v_cndmask_b16 v5, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] -# W64: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00 -# W32: v_cndmask_b16 v5, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] -# W64: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00 -# W32: v_cndmask_b16 v5, null, m0, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] -# W64: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, null, m0, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, null, m0, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] 0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41 -# W32: v_cndmask_b16 v5, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] -# W64: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W32-REAL16: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W32-FAKE16: v_cndmask_b16 v5, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W64-REAL16: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W64-FAKE16: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] 0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01 -# W32: v_cndmask_b16 v5, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] -# W64: v_cndmask_b16 v5, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v5, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v5, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] 0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21 -# W32: v_cndmask_b16 v5, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] -# W64: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W32-REAL16: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W32-FAKE16: v_cndmask_b16 v5, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W64-REAL16: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W64-FAKE16: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] 0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 -# GFX12: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +0x05,0x08,0x5d,0xd6,0xff,0xe1,0x19,0x00 +# W32-REAL16: v_cndmask_b16 v5.l, v255.h, 0x3800, s6 ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, v255.h, 0x3800, s[6:7] ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] + +0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00 +# W32-REAL16: v_cndmask_b16 v5.l, m0, v255.h, s6 ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W32-FAKE16: v_cndmask_b16 v5, m0, v255, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, m0, v255.h, s[6:7] ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-FAKE16: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] + +0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00 # GFX12: v_cubeid_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt index 7e30a4a2096b1..0be540da8287b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt @@ -789,59 +789,106 @@ # GFX12: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] 0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30 -# GFX12: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] + +0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] + +0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] + +0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30 +# W32-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff # GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt index 2aaba2a17fae6..343a71abb27d0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt @@ -447,23 +447,52 @@ # GFX12: v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] 0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00 -# GFX12: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W32-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] + +0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] + +0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] + +0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00 +# W32-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] 0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 # GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] From dbcb1f8a48f8ff38f62117bd4dcfe016cda24b9a Mon Sep 17 00:00:00 2001 From: guochen2 Date: Thu, 16 Jan 2025 14:11:37 -0500 Subject: [PATCH 2/2] address PR comment --- llvm/lib/Target/AMDGPU/SIInstructions.td | 13 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 135 ++-- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 187 +++-- llvm/test/CodeGen/AMDGPU/bf16.ll | 631 ++++++++------- .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll | 126 +-- .../CodeGen/AMDGPU/extract-subvector-16bit.ll | 156 ++-- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 92 +-- llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll | 105 ++- llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll | 105 ++- .../AMDGPU/fmul-2-combine-multi-use.ll | 27 +- llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll | 144 ++-- llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 31 +- .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 101 ++- llvm/test/CodeGen/AMDGPU/fract-match.ll | 56 +- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 143 ++-- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 421 +++++----- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 421 +++++----- llvm/test/CodeGen/AMDGPU/llvm.round.ll | 22 +- llvm/test/CodeGen/AMDGPU/lround.ll | 8 +- llvm/test/CodeGen/AMDGPU/maximumnum.ll | 86 +- llvm/test/CodeGen/AMDGPU/minimumnum.ll | 86 +- .../AMDGPU/select-fabs-fneg-extract.f16.ll | 190 ++--- .../AMDGPU/select-fabs-fneg-extract.v2f16.ll | 673 ++++++++-------- .../AMDGPU/select-flags-to-fmin-fmax.ll | 252 +++--- llvm/test/CodeGen/AMDGPU/select.f16.ll | 740 +++++++++--------- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 8 +- 26 files changed, 2464 insertions(+), 2495 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index c67c73649fc3d..1abbf4c217a69 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1249,25 +1249,18 @@ class VOPSelectPat_t16 : GCNPat < (vt (select i1:$src0, vt:$src1, vt:$src2)), (V_CNDMASK_B16_t16_e64 0, VSrcT_b16:$src2, 0, VSrcT_b16:$src1, SSrc_i1:$src0) >; -class VOPSelectPat_fake16 : GCNPat < - (vt (select i1:$src0, vt:$src1, vt:$src2)), - (V_CNDMASK_B16_fake16_e64 0, VSrc_b16:$src2, 0, VSrc_b16:$src1, SSrc_i1:$src0) ->; def : VOPSelectModsPat ; def : VOPSelectModsPat ; -let True16Predicate = NotHasTrue16BitInsts in { +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in { def : VOPSelectPat ; def : VOPSelectPat ; -} // End True16Predicate = NotHasTrue16BitInsts +} // End True16Predicate = p let True16Predicate = UseRealTrue16Insts in { def : VOPSelectPat_t16 ; def : VOPSelectPat_t16 ; } // End True16Predicate = UseRealTrue16Insts -let True16Predicate = UseFakeTrue16Insts in { - def : VOPSelectPat_fake16 ; - def : VOPSelectPat_fake16 ; -} // End True16Predicate = UseFakeTrue16Insts let AddedComplexity = 1 in { def : GCNPat < diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index e27d4372d87be..e289ee759da15 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -5294,15 +5294,15 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s10 ; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX11-NEXT: s_cselect_b32 s1, 1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX11-NEXT: s_cselect_b32 s1, 1, 0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX11-NEXT: s_and_b32 s1, 1, s1 -; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0 -; GFX11-NEXT: s_ashr_i32 s0, s9, 31 -; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX11-NEXT: v_mov_b32_e32 v2, s5 +; GFX11-NEXT: s_ashr_i32 s0, s9, 31 ; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0 @@ -5447,20 +5447,20 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo -; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], v[0:1] -; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[2:3] +; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5] -; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], v[4:5] -; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX11-NEXT: v_cndmask_b16 v3, v7, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v2, v2, 0, s0 -; GFX11-NEXT: v_xor_b32_e32 v2, v2, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -5606,22 +5606,21 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[4:5], v[0:1] +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] ; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[6:7], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX11-NEXT: s_and_b32 s0, 1, s1 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 +; GFX11-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11-NEXT: s_and_b32 s0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo @@ -5847,33 +5846,33 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[8:9], v[0:1] -; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[10:11] +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[16:17], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[10:11] -; GFX11-NEXT: v_add_co_u32 v10, s1, v4, v12 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, s1, v5, v13, s1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v12, s1, v6, v14, s1 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s0 -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[10:11], v[4:5] -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, s1, v7, v15, s1 -; GFX11-NEXT: v_cmp_gt_i64_e64 s1, 0, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v4, v12 +; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5] ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[12:13], v[6:7] -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[12:13], v[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1 -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v13 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7] +; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 -; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v2, v4, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5883,10 +5882,10 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v10, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v11, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v13, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v19, v7, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result @@ -6244,16 +6243,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s18 ; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX11-NEXT: s_cselect_b32 s1, 1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX11-NEXT: s_cselect_b32 s1, 1, 0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX11-NEXT: s_and_b32 s1, 1, s1 ; GFX11-NEXT: s_ashr_i32 s10, s17, 31 -; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX11-NEXT: s_add_i32 s11, s10, 0x80000000 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX11-NEXT: s_add_u32 s0, s4, s12 -; GFX11-NEXT: v_cndmask_b16 v1, v2, 0, s1 ; GFX11-NEXT: s_addc_u32 s1, s5, s13 ; GFX11-NEXT: s_addc_u32 s2, s6, s14 ; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] @@ -6269,18 +6268,17 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX11-NEXT: s_and_b32 s4, 1, s12 ; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 -; GFX11-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX11-NEXT: s_and_b32 s5, 1, s5 -; GFX11-NEXT: v_cmp_ne_u32_e64 s5, 0, s5 -; GFX11-NEXT: v_cndmask_b16 v1, v2, v1, s4 -; GFX11-NEXT: s_ashr_i32 s4, s3, 31 -; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 -; GFX11-NEXT: v_cndmask_b16 v2, v3, 0, s5 +; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, 0, s4 ; GFX11-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_ashr_i32 s4, s3, 31 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v4, s9 ; GFX11-NEXT: v_mov_b32_e32 v2, s17 @@ -6289,6 +6287,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index af96da1bb25ad..43ebe156eb2a2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -5297,28 +5297,28 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX11-NEXT: s_sub_u32 s8, s0, s4 ; GFX11-NEXT: s_subb_u32 s9, s1, s5 ; GFX11-NEXT: s_subb_u32 s10, s2, s6 -; GFX11-NEXT: s_subb_u32 s11, s3, s7 ; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] +; GFX11-NEXT: s_subb_u32 s11, s3, s7 ; GFX11-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[10:11], s[2:3] -; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0 ; GFX11-NEXT: s_cselect_b32 s12, 1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3] +; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s12 ; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX11-NEXT: s_ashr_i32 s0, s11, 31 ; GFX11-NEXT: s_and_b32 s1, 1, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0 -; GFX11-NEXT: s_ashr_i32 s0, s11, 31 -; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s1 -; GFX11-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v3, s11 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 ; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9 +; GFX11-NEXT: v_mov_b32_e32 v3, s11 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_and_b32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5470,26 +5470,25 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo ; GFX11-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], v[4:5] -; GFX11-NEXT: v_cmp_lt_u64_e64 s1, 0, v[0:1] +; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], v[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, 0, v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 -; GFX11-NEXT: v_cndmask_b16 v2, v9, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0 -; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 +; GFX11-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5639,29 +5638,29 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e64 s4, v[4:5], v[0:1] +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] ; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 ; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX11-NEXT: v_cmp_lt_i64_e64 s4, v[6:7], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 ; GFX11-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s4 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5897,38 +5896,37 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_sub_co_u32 v16, vcc_lo, v0, v8 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo ; GFX11-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e64 s1, 0, v[8:9] -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[16:17], v[0:1] ; GFX11-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[18:19], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1 -; GFX11-NEXT: v_sub_co_u32 v8, s1, v4, v12 -; GFX11-NEXT: v_sub_co_ci_u32_e64 v9, s1, v5, v13, s1 -; GFX11-NEXT: v_sub_co_ci_u32_e64 v10, s1, v6, v14, s1 -; GFX11-NEXT: v_sub_co_ci_u32_e64 v11, s1, v7, v15, s1 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[8:9], v[4:5] -; GFX11-NEXT: v_cmp_lt_u64_e64 s1, 0, v[12:13] -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[10:11], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5] ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[10:11], v[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, 0, v[14:15] -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v11 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[14:15] -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 -; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v2, v5, v4, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13] +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7] +; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] +; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v19 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5940,8 +5938,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v10, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v11, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v21, v7, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result @@ -6305,57 +6303,57 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: s_sub_u32 s18, s0, s8 ; GFX11-NEXT: s_subb_u32 s19, s1, s9 ; GFX11-NEXT: s_subb_u32 s16, s2, s10 -; GFX11-NEXT: s_subb_u32 s17, s3, s11 ; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1] +; GFX11-NEXT: s_subb_u32 s17, s3, s11 ; GFX11-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[16:17], s[2:3] -; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 ; GFX11-NEXT: s_cselect_b32 s20, 1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] +; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s20 ; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX11-NEXT: s_and_b32 s1, 1, s1 ; GFX11-NEXT: s_ashr_i32 s8, s17, 31 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 +; GFX11-NEXT: s_and_b32 s1, 1, s1 ; GFX11-NEXT: s_add_i32 s9, s8, 0x80000000 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 ; GFX11-NEXT: s_sub_u32 s0, s4, s12 -; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s1 ; GFX11-NEXT: s_subb_u32 s1, s5, s13 ; GFX11-NEXT: s_subb_u32 s2, s6, s14 -; GFX11-NEXT: s_subb_u32 s3, s7, s15 ; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX11-NEXT: s_subb_u32 s3, s7, s15 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX11-NEXT: v_cmp_lt_i64_e64 s5, s[2:3], s[6:7] -; GFX11-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 -; GFX11-NEXT: s_cselect_b32 s10, 1, 0 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7] +; GFX11-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 +; GFX11-NEXT: s_cselect_b32 s10, 1, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX11-NEXT: s_and_b32 s4, 1, s10 ; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX11-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 -; GFX11-NEXT: s_and_b32 s5, 1, s5 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_and_b32 s5, 1, s5 +; GFX11-NEXT: s_ashr_i32 s4, s3, 31 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 -; GFX11-NEXT: v_cmp_ne_u32_e64 s5, 0, s5 -; GFX11-NEXT: v_cndmask_b16 v1, v2, v1, s4 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_mov_b32_e32 v7, s3 -; GFX11-NEXT: v_cndmask_b16 v2, v4, v3, s5 -; GFX11-NEXT: v_mov_b32_e32 v3, s18 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s18 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_ashr_i32 s4, s3, 31 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v4, s19 ; GFX11-NEXT: v_mov_b32_e32 v2, s17 @@ -6364,7 +6362,6 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo -; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 12e677e5546fd..8e3c905b0eae5 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -34525,7 +34525,7 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, bfloat %a, bfloat %b ret bfloat %op @@ -34598,10 +34598,10 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1 -; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg bfloat %a %op = select i1 %cond, bfloat %neg.a, bfloat %b @@ -34675,10 +34675,10 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v2 -; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.b = fneg bfloat %b %op = select i1 %cond, bfloat %a, bfloat %neg.b @@ -34782,15 +34782,14 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX11FAKE16-LABEL: v_select_v2bf16: ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11FAKE16-NEXT: v_cndmask_b16 v1, v2, v1, vcc_lo -; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v0, v3, vcc_lo +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b ret <2 x bfloat> %op @@ -34899,15 +34898,14 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 -; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v3, v2, vcc_lo -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_cndmask_b16 v1, v5, v4, s0 +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b @@ -34977,10 +34975,10 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; ; GFX11FAKE16-LABEL: s_select_bf16: ; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_mov_b32_e32 v1, s0 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11FAKE16-NEXT: v_mov_b32_e32 v0, s0 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_cndmask_b16 v0, s1, v0, vcc_lo +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 @@ -35099,13 +35097,13 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_lshr_b32 s2, s0, 16 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11FAKE16-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v0, s0 -; GFX11FAKE16-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11FAKE16-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0 +; GFX11FAKE16-NEXT: s_lshr_b32 s3, s1, 16 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11FAKE16-NEXT: v_cndmask_b16 v1, s0, v1, vcc_lo +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_cndmask_b16 v0, s1, v0, vcc_lo -; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo +; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11FAKE16-NEXT: ; return to shader part epilog @@ -35220,16 +35218,16 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX11FAKE16-LABEL: s_vselect_v2bf16: ; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_lshr_b32 s3, s0, 16 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX11FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s0 +; GFX11FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0 ; GFX11FAKE16-NEXT: s_lshr_b32 s0, s1, 16 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11FAKE16-NEXT: v_cndmask_b16 v0, s0, v0, s2 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_cndmask_b16 v1, s1, v1, vcc_lo -; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo +; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11FAKE16-NEXT: ; return to shader part epilog @@ -36958,27 +36956,30 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX11FAKE16-LABEL: s_vselect_v4bf16: ; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_lshr_b32 s7, s1, 16 -; GFX11FAKE16-NEXT: s_lshr_b32 s9, s0, 16 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3 -; GFX11FAKE16-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s9 -; GFX11FAKE16-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX11FAKE16-NEXT: s_lshr_b32 s8, s3, 16 +; GFX11FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX11FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1 +; GFX11FAKE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo +; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, s5 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11FAKE16-NEXT: v_mov_b32_e32 v6, s0 ; GFX11FAKE16-NEXT: s_lshr_b32 s0, s2, 16 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_cndmask_b16 v0, s8, v0, s6 -; GFX11FAKE16-NEXT: v_cndmask_b16 v1, s0, v1, s4 -; GFX11FAKE16-NEXT: v_cndmask_b16 v2, s2, v2, vcc_lo -; GFX11FAKE16-NEXT: v_cndmask_b16 v3, s3, v3, s5 -; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 -; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11FAKE16-NEXT: ; return to shader part epilog %cond = icmp eq <4 x i32> %c, zeroinitializer %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b @@ -37158,28 +37159,25 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX11FAKE16-LABEL: v_vselect_v4bf16: ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v1, 1, v1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GFX11FAKE16-NEXT: v_cndmask_b16 v5, v7, v5, s2 -; GFX11FAKE16-NEXT: v_cndmask_b16 v4, v6, v4, s0 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_cndmask_b16 v1, v2, v1, vcc_lo -; GFX11FAKE16-NEXT: v_cndmask_b16 v2, v3, v0, s1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v4, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v1, v2, v5, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b ret <4 x bfloat> %op @@ -37473,45 +37471,43 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX11FAKE16-LABEL: v_vselect_v8bf16: ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v6, v15, v11 :: v_dual_and_b32 v1, 1, v1 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v3, 1, v3 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v4 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v13, v9 :: v_dual_and_b32 v7, 1, v7 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v5 -; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v7 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v3 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v1 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v4 -; GFX11FAKE16-NEXT: v_cndmask_b16 v3, v15, v11, s2 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v15 -; GFX11FAKE16-NEXT: v_cndmask_b16 v2, v14, v10, s5 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v10 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v9 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v13 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v14 -; GFX11FAKE16-NEXT: v_cndmask_b16 v9, v13, v9, s3 -; GFX11FAKE16-NEXT: v_cndmask_b16 v8, v12, v8, s0 -; GFX11FAKE16-NEXT: v_cndmask_b16 v6, v7, v6, vcc_lo -; GFX11FAKE16-NEXT: v_cndmask_b16 v5, v10, v5, s1 -; GFX11FAKE16-NEXT: v_cndmask_b16 v4, v11, v4, s4 -; GFX11FAKE16-NEXT: v_cndmask_b16 v7, v1, v0, s6 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_perm_b32 v0, v6, v8, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v1, v5, v9, 0x5040100 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v3, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo +; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b ret <8 x bfloat> %op @@ -38177,80 +38173,80 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v12, v30, v22 :: v_dual_and_b32 v11, 1, v11 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 ; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24 +; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 ; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25 ; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v12 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v10 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14 -; GFX11FAKE16-NEXT: v_cndmask_b16 v1, v25, v17, s2 -; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v24, v16, s0 -; GFX11FAKE16-NEXT: v_cndmask_b16 v10, v54, v53, vcc_lo -; GFX11FAKE16-NEXT: v_cndmask_b16 v11, v52, v51, s1 -; GFX11FAKE16-NEXT: v_cndmask_b16 v6, v30, v22, s10 -; GFX11FAKE16-NEXT: v_cndmask_b16 v7, v34, v33, s11 -; GFX11FAKE16-NEXT: v_cndmask_b16 v5, v29, v21, s12 -; GFX11FAKE16-NEXT: v_cndmask_b16 v8, v36, v35, s9 -; GFX11FAKE16-NEXT: v_cndmask_b16 v4, v28, v20, s8 -; GFX11FAKE16-NEXT: v_cndmask_b16 v9, v38, v37, s7 -; GFX11FAKE16-NEXT: v_cndmask_b16 v3, v27, v19, s6 -; GFX11FAKE16-NEXT: v_cndmask_b16 v2, v26, v18, s4 -; GFX11FAKE16-NEXT: v_cndmask_b16 v12, v50, v49, s3 -; GFX11FAKE16-NEXT: v_cndmask_b16 v13, v48, v39, s5 -; GFX11FAKE16-NEXT: v_perm_b32 v0, v10, v0, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v2, v12, v2, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v3, v13, v3, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v5, v8, v5, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo +; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v31 -; GFX11FAKE16-NEXT: v_cndmask_b16 v10, v31, v23, s14 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_cndmask_b16 v11, v14, v32, s13 -; GFX11FAKE16-NEXT: v_perm_b32 v7, v11, v10, 0x5040100 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v31 +; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo +; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b ret <16 x bfloat> %op @@ -39942,168 +39938,167 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:8 ; GFX11FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:68 ; GFX11FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:4 -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 1, v18 -; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX11FAKE16-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX11FAKE16-NEXT: v_and_b32_e32 v28, 1, v28 ; GFX11FAKE16-NEXT: v_and_b32_e32 v30, 1, v30 -; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11FAKE16-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX11FAKE16-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30 ; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 +; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 +; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 ; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 +; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 ; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 ; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 ; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v50, v51, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18 +; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v50 ; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v52, v53, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 ; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v53 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v52 ; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 1, v29 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s16, 1, v16 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s18, 1, v18 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s20, 1, v20 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s22, 1, v22 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s24, 1, v24 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v30 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s27, 1, v26 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v28 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v10 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v12 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v17 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v19 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v21 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v23 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s23, 1, v25 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s25, 1, v27 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s28, 1, v29 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v31 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11FAKE16-NEXT: v_cndmask_b16 v15, v32, v33, s26 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11FAKE16-NEXT: v_cndmask_b16 v14, v34, v35, s29 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v34 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(26) -; GFX11FAKE16-NEXT: v_cndmask_b16 v13, v36, v37, s27 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v37 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(24) -; GFX11FAKE16-NEXT: v_cndmask_b16 v12, v38, v39, s24 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v38 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11FAKE16-NEXT: v_cndmask_b16 v11, v48, v49, s22 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v49 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v48 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11FAKE16-NEXT: v_cndmask_b16 v16, v50, v51, s20 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v51 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v50 -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11FAKE16-NEXT: v_cndmask_b16 v19, v52, v53, s18 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v53 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v52 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11FAKE16-NEXT: v_cndmask_b16 v22, v54, v55, s16 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v55 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v54 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v54, v55, vcc_lo +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11FAKE16-NEXT: v_cndmask_b16 v25, v64, v65, s14 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v65 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v64 +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v14, v64, v65 :: v_dual_and_b32 v19, 1, v19 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v64 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v67 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v66 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v66, v67, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v66 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v69 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v68 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v68, v69, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v68 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v71 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v70 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v70, v71, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 1, v27 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v71 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v70 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v81 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v80 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v80, v81, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v81 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v80 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v83 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v82 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v82, v83, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11FAKE16-NEXT: v_and_b32_e32 v31, 1, v31 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v83 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v85 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v84 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v84, v85, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v85 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v84 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11FAKE16-NEXT: v_cndmask_b16 v54, v86, v87, s0 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v87 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v86 -; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX11FAKE16-NEXT: v_cndmask_b16 v28, v66, v67, s12 -; GFX11FAKE16-NEXT: v_cndmask_b16 v31, v68, v69, s10 -; GFX11FAKE16-NEXT: v_cndmask_b16 v34, v70, v71, s8 -; GFX11FAKE16-NEXT: v_cndmask_b16 v37, v80, v81, s6 -; GFX11FAKE16-NEXT: v_cndmask_b16 v48, v82, v83, s4 -; GFX11FAKE16-NEXT: v_cndmask_b16 v51, v84, v85, s2 -; GFX11FAKE16-NEXT: v_cndmask_b16 v65, v4, v3, s28 -; GFX11FAKE16-NEXT: v_cndmask_b16 v66, v6, v5, s25 -; GFX11FAKE16-NEXT: v_cndmask_b16 v67, v8, v7, s23 -; GFX11FAKE16-NEXT: v_cndmask_b16 v68, v10, v9, s21 -; GFX11FAKE16-NEXT: v_cndmask_b16 v10, v18, v17, s19 -; GFX11FAKE16-NEXT: v_cndmask_b16 v9, v21, v20, s17 -; GFX11FAKE16-NEXT: v_cndmask_b16 v8, v24, v23, s15 -; GFX11FAKE16-NEXT: v_cndmask_b16 v7, v27, v26, s13 -; GFX11FAKE16-NEXT: v_cndmask_b16 v6, v30, v29, s11 -; GFX11FAKE16-NEXT: v_cndmask_b16 v5, v33, v32, s9 -; GFX11FAKE16-NEXT: v_cndmask_b16 v4, v36, v35, s7 -; GFX11FAKE16-NEXT: v_cndmask_b16 v0, v64, v55, vcc_lo -; GFX11FAKE16-NEXT: v_cndmask_b16 v3, v53, v52, s1 -; GFX11FAKE16-NEXT: v_cndmask_b16 v17, v50, v49, s3 -; GFX11FAKE16-NEXT: v_cndmask_b16 v18, v39, v38, s5 -; GFX11FAKE16-NEXT: v_cndmask_b16 v20, v2, v1, s0 -; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v54, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v51, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v2, v17, v48, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v3, v18, v37, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v34, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v5, v5, v31, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v6, v6, v28, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v7, v7, v25, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v8, v8, v22, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v19, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v10, v10, v16, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v11, v68, v11, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v12, v67, v12, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v13, v66, v13, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v14, v65, v14, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v86, v87, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v87 +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v86 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v64, v65, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v66, v67, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v68, v69, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v84, v85, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v86, v87, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v82, v83, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v70, v71, vcc_lo +; GFX11FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v10, v21, v20, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v11, v23, v22, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v8, v17, v16, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v9, v19, v18, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v12, v25, v24, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v13, v27, v26, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v14, v29, v28, 0x5040100 +; GFX11FAKE16-NEXT: v_perm_b32 v15, v31, v30, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b ret <32 x bfloat> %op diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index beedc60225947..b128be2186df2 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -1840,14 +1840,14 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1, ; GFX11-SDAG-LABEL: fmul_select_v2f16_test3: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0x4000 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3c00, v2, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v2, 0x3c00, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1967,14 +1967,14 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1, ; GFX11-SDAG-LABEL: fmul_select_v2f16_test4: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x3800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0x3800 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3c00, v2, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v2, 0x3c00, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2121,20 +2121,20 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX11-SDAG-LABEL: fmul_select_f16_test6: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xc800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0xc800 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x4200, v1, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo ; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: fmul_select_f16_test6: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x4200 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x4200 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b16 v1, v1, 0xc800, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc800, vcc_lo ; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 @@ -2209,20 +2209,20 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX11-SDAG-LABEL: fmul_select_f16_test7: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x4800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x4800 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xc400, v1, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo ; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: fmul_select_f16_test7: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xc400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0xc400 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b16 v1, v1, 0x4800, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4800, vcc_lo ; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 @@ -2276,7 +2276,7 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: v_cndmask_b16 v1, 0, 0x8000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2625,7 +2625,7 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3f80, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -2740,7 +2740,7 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3f80, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -2885,23 +2885,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, v2, v4 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v2, 0x3f80, v5, s0 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3f80, v5, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-SDAG-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX11-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX11-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) @@ -3043,23 +3044,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f00 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, v2, v4 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v2, 0x3f80, v5, s0 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3f80, v5, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-SDAG-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX11-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX11-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) @@ -3167,7 +3169,7 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x4100, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -3283,7 +3285,7 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x4040, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -3300,10 +3302,10 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-GISEL-LABEL: fmul_select_bf16_test6: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x4040 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x4040 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b16 v1, v1, 0xc100, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo ; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 @@ -3399,7 +3401,7 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xc080, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -3416,10 +3418,10 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-GISEL-LABEL: fmul_select_bf16_test7: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xc080 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0xc080 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b16 v1, v1, 0x4100, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo ; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 @@ -3530,7 +3532,7 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: v_cndmask_b16 v1, 0, 0x8000, vcc_lo +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3630,7 +3632,7 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xc200, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -3746,7 +3748,7 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xdb80, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -3858,7 +3860,7 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x4c00, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index 7187801e5990b..efbbe2b27f10f 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -450,22 +450,20 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB2_4: ; %exit -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v1 +; GFX11-NEXT: v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0 ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v3 -; GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0.5, v2 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00 -; GFX11-NEXT: v_mov_b32_e32 v2, 0x3900 -; GFX11-NEXT: v_cmp_ge_f16_e64 s1, 0.5, v0 -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0.5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b16 v0, 0x3900, v1, s0 -; GFX11-NEXT: v_cndmask_b16 v2, 0x3d00, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b16 v3, 0x3900, v1, s1 -; GFX11-NEXT: v_cndmask_b16 v1, 0x3900, v1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3 -; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo +; GFX11-NEXT: v_pack_b32_f16 v0, v2, v1 +; GFX11-NEXT: v_pack_b32_f16 v1, v3, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -1064,22 +1062,20 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB5_4: ; %exit -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v1 +; GFX11-NEXT: v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0 ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v3 -; GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0.5, v2 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00 -; GFX11-NEXT: v_mov_b32_e32 v2, 0x3900 -; GFX11-NEXT: v_cmp_ge_f16_e64 s1, 0.5, v0 -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0.5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b16 v0, 0x3900, v1, s0 -; GFX11-NEXT: v_cndmask_b16 v2, 0x3d00, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b16 v3, 0x3900, v1, s1 -; GFX11-NEXT: v_cndmask_b16 v1, 0x3900, v1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3 -; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo +; GFX11-NEXT: v_pack_b32_f16 v0, v2, v1 +; GFX11-NEXT: v_pack_b32_f16 v1, v3, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -1410,34 +1406,34 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB7_4: ; %exit -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, 0x3d00 +; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cmp_gt_u16_e64 s0, 0x3801, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cmp_lt_u16_e64 s1, 0x3800, v1 -; GFX11-NEXT: v_cndmask_b16 v7, 0x3900, v5, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v1, 0x3900 -; GFX11-NEXT: v_cndmask_b16 v8, 0x3900, v5, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo ; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v3 -; GFX11-NEXT: v_cmp_gt_u16_e64 s0, 0x3801, v2 -; GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x3801, v0 -; GFX11-NEXT: v_cmp_gt_u16_e64 s3, 0x3801, v6 -; GFX11-NEXT: v_cmp_gt_u16_e64 s34, 0x3801, v4 -; GFX11-NEXT: v_cndmask_b16 v2, 0x3900, v5, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v0, 0x3900, v5, s0 -; GFX11-NEXT: v_cndmask_b16 v3, 0x3900, v5, s2 -; GFX11-NEXT: v_cndmask_b16 v4, 0x3900, v5, s3 -; GFX11-NEXT: v_cndmask_b16 v5, 0x3900, v5, s34 -; GFX11-NEXT: v_cndmask_b16 v6, 0x3d00, v1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v5, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v6, v7, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x3800, v8 +; GFX11-NEXT: v_perm_b32 v2, v7, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v6, v5, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 %cond, label %T, label %F @@ -1701,34 +1697,34 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB8_4: ; %exit -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, 0x3d00 +; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0.5, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cmp_nge_f16_e64 s1, 0.5, v1 -; GFX11-NEXT: v_cndmask_b16 v7, 0x3900, v5, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v1, 0x3900 -; GFX11-NEXT: v_cndmask_b16 v8, 0x3900, v5, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3 -; GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0.5, v2 -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0.5, v0 -; GFX11-NEXT: v_cmp_ge_f16_e64 s3, 0.5, v6 -; GFX11-NEXT: v_cmp_ge_f16_e64 s34, 0.5, v4 -; GFX11-NEXT: v_cndmask_b16 v2, 0x3900, v5, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v0, 0x3900, v5, s0 -; GFX11-NEXT: v_cndmask_b16 v3, 0x3900, v5, s2 -; GFX11-NEXT: v_cndmask_b16 v4, 0x3900, v5, s3 -; GFX11-NEXT: v_cndmask_b16 v5, 0x3900, v5, s34 -; GFX11-NEXT: v_cndmask_b16 v6, 0x3d00, v1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3 -; GFX11-NEXT: v_pack_b32_f16 v1, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_pack_b32_f16 v2, v8, v5 -; GFX11-NEXT: v_pack_b32_f16 v3, v7, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo +; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v8 +; GFX11-NEXT: v_pack_b32_f16 v2, v4, v7 +; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v3, v5, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 %cond, label %T, label %F diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index ffe7649e4bbb1..c3c1540383ec6 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -645,36 +645,36 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] ; GFX11-NEXT: s_cmp_eq_u32 s4, 1 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v5, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: s_cmp_eq_u32 s4, 3 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 4 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v5, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 5 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v2, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 6 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX11-NEXT: s_cmp_eq_u32 s4, 7 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v3, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -865,69 +865,69 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] ; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] offset:16 ; GFX11-NEXT: s_cmp_eq_u32 s4, 1 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 2 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v9, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX11-NEXT: s_cmp_eq_u32 s4, 3 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 4 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v9, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 5 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v2, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 6 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX11-NEXT: s_cmp_eq_u32 s4, 7 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v3, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX11-NEXT: s_cmp_eq_u32 s4, 9 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v4, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX11-NEXT: s_cmp_eq_u32 s4, 11 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v5, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX11-NEXT: s_cmp_eq_u32 s4, 13 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v6, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 14 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX11-NEXT: s_cmp_eq_u32 s4, 15 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v7, s2 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index 91f57e644ec72..e874ee56f594c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -62,7 +62,7 @@ define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 { ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_f16: @@ -151,12 +151,11 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s0, v0, v1 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v1, v0, s0 -; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; @@ -263,13 +262,12 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s0, v5, v4 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v4, v5, s0 -; GFX11-SAFE-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; @@ -397,23 +395,22 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s0, v0, v2 -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s1, v5, v4 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s2, v7, v6 -; GFX11-SAFE-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v2, v0, s0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v4, v5, s1 -; GFX11-SAFE-NEXT: v_cndmask_b16 v3, v6, v7, s2 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX11-SAFE-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: @@ -618,36 +615,36 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8 -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s0, v11, v10 -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s1, v13, v12 -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s3, v0, v4 -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s2, v15, v14 -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s4, v1, v5 -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s5, v2, v6 -; GFX11-SAFE-NEXT: v_cmp_nle_f16_e64 s6, v3, v7 -; GFX11-SAFE-NEXT: v_cndmask_b16 v12, v12, v13, s1 -; GFX11-SAFE-NEXT: v_cndmask_b16 v14, v14, v15, s2 -; GFX11-SAFE-NEXT: v_cndmask_b16 v10, v10, v11, s0 -; GFX11-SAFE-NEXT: v_cndmask_b16 v8, v8, v9, vcc_lo -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v4, v0, s3 -; GFX11-SAFE-NEXT: v_cndmask_b16 v1, v5, v1, s4 -; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v6, v2, s5 -; GFX11-SAFE-NEXT: v_cndmask_b16 v3, v7, v3, s6 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7 +; GFX11-SAFE-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo ; GFX11-SAFE-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-SAFE-NEXT: v_perm_b32 v1, v10, v1, 0x5040100 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SAFE-NEXT: v_perm_b32 v2, v12, v2, 0x5040100 -; GFX11-SAFE-NEXT: v_perm_b32 v3, v14, v3, 0x5040100 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index b7e9e15a0561f..0723290bdf734 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -63,7 +63,7 @@ define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 { ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_f16: @@ -152,12 +152,11 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s0, v0, v1 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v1, v0, s0 -; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; @@ -264,13 +263,12 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s0, v5, v4 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v4, v5, s0 -; GFX11-SAFE-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; @@ -398,23 +396,22 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v4f16: ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s0, v0, v2 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s1, v5, v4 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s2, v7, v6 -; GFX11-SAFE-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v2, v0, s0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v4, v5, s1 -; GFX11-SAFE-NEXT: v_cndmask_b16 v3, v6, v7, s2 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX11-SAFE-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v4f16: @@ -619,36 +616,36 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v8f16: ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 ; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v11, v10 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v13, v12 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v15, v14 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9, v8 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s0, v11, v10 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s1, v13, v12 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s3, v0, v4 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s2, v15, v14 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s4, v1, v5 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s5, v2, v6 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s6, v3, v7 -; GFX11-SAFE-NEXT: v_cndmask_b16 v12, v12, v13, s1 -; GFX11-SAFE-NEXT: v_cndmask_b16 v14, v14, v15, s2 -; GFX11-SAFE-NEXT: v_cndmask_b16 v10, v10, v11, s0 -; GFX11-SAFE-NEXT: v_cndmask_b16 v8, v8, v9, vcc_lo -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v4, v0, s3 -; GFX11-SAFE-NEXT: v_cndmask_b16 v1, v5, v1, s4 -; GFX11-SAFE-NEXT: v_cndmask_b16 v2, v6, v2, s5 -; GFX11-SAFE-NEXT: v_cndmask_b16 v3, v7, v3, s6 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v2, v6 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v4 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v5 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v7 +; GFX11-SAFE-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo ; GFX11-SAFE-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-SAFE-NEXT: v_perm_b32 v1, v10, v1, 0x5040100 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SAFE-NEXT: v_perm_b32 v2, v12, v2, 0x5040100 -; GFX11-SAFE-NEXT: v_perm_b32 v3, v14, v3, 0x5040100 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 6c1a7ac56a867..64be9cb72a6ee 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -425,16 +425,15 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_lshr_b32 s1, s0, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s0, -1.0 -; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s1, -1.0 +; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-DENORM-NEXT: v_cmp_gt_f16_e64 s2, |v0|, |v1| -; GFX11-DENORM-NEXT: v_cndmask_b16 v0, v1, v0, s2 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| +; GFX11-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-DENORM-NEXT: v_add_f16_e64 v0, |v0|, |v0| +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_mul_f16_e32 v1, v0, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_fma_f16 v0, -v1, v0, 1.0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_store_b16 v2, v0, s[0:1] @@ -445,18 +444,18 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s0, -1.0 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s1, -1.0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_cmp_gt_f16_e64 s0, |v0|, |v1| -; GFX11-FLUSH-NEXT: v_cndmask_b16 v0, v1, v0, s0 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s1, -1.0 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| +; GFX11-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |v0|, |v0| +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v0, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v1, v0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v0, 1.0, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index bef424c5287fe..9ae60f99d5e09 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -4947,9 +4947,9 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_64_1: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x5400 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x3c00, v0, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5009,9 +5009,9 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_1_64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x3c00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5400, v0, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5071,9 +5071,9 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xd400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0xd400 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xbc00, v0, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5133,9 +5133,9 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xbc00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0xbc00 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xd400, v0, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5202,9 +5202,9 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x5800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x5800 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5400, v0, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5267,9 +5267,9 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_4: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x5800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x5800 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x4400, v0, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5336,9 +5336,9 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_2_4: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x4000 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x4400, v0, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5401,9 +5401,9 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) { ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_4_128: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x4400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x4400 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5800, v0, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5800, v3, vcc_lo ; GFX11-SDAG-NEXT: v_fma_f16 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5478,10 +5478,10 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x3c00, v3, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3c00, v3, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5560,10 +5560,10 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x3c00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5400, v3, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x5400, v3, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5644,10 +5644,10 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xbc00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xd400, v3, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xd400, v3, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5738,10 +5738,10 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x5800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5400, v3, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x5400, v3, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5838,10 +5838,10 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xd800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xd400, v3, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xd400, v3, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5928,10 +5928,10 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xd800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xcc00, v3, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xcc00, v3, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xcc00, v3, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xcc00, v3, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6013,10 +6013,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x5400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x3c00, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x3c00, v4, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v4, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6099,10 +6099,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x5400, v4, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6187,10 +6187,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0xd400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xbc00, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xbc00, v4, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v4, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6276,10 +6276,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0xbc00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0xd400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0xd400, v4, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xd400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6374,10 +6374,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x5800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x5400, v4, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6464,10 +6464,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x5800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x4400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x4400, v4, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6561,10 +6561,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x4400, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x4400, v4, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6651,10 +6651,10 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x4400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b16 v0, 0x5800, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cndmask_b16 v1, 0x5800, v4, s0 -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5800, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5800, v4, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index 589804177a747..b32630a97b3ad 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -558,12 +558,12 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ; GFX11-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_add_f16_e32 v0, 0, v0 +; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0 ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0 -; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, v0, s0, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x7e00, 0, vcc_lo +; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo ; GFX11-SAFE-NEXT: ; return to shader part epilog ; ; GFX11-NSZ-LABEL: fneg_fadd_0_f16: @@ -573,10 +573,10 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0 -; GFX11-NSZ-NEXT: v_cndmask_b16 v0, v0, s0, s1 +; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x7e00, 0, vcc_lo +; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo ; GFX11-NSZ-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv half 1.000000e+00, %tmp6 @@ -646,12 +646,13 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, < ; ; GFX11-SAFE-LABEL: fneg_fadd_0_nsz_f16: ; GFX11-SAFE: ; %bb.0: ; %.entry -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 s1, s0, 0 ; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x8000, v0, s1 +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 vcc_lo, s0, 0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x8000, v0, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x7e00, 0, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo ; GFX11-SAFE-NEXT: ; return to shader part epilog ; ; GFX11-NSZ-LABEL: fneg_fadd_0_nsz_f16: @@ -661,10 +662,10 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, < ; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0 -; GFX11-NSZ-NEXT: v_cndmask_b16 v0, v0, s0, s1 +; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x7e00, 0, vcc_lo +; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo ; GFX11-NSZ-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv afn half 1.000000e+00, %tmp6 @@ -3834,7 +3835,7 @@ define half @v_fneg_round_f16(half %a) #0 { ; GFX11-SAFE-NEXT: v_sub_f16_e32 v2, v0, v1 ; GFX11-SAFE-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s0 +; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 ; GFX11-SAFE-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_add_f16_e32 v0, v1, v0 @@ -3849,7 +3850,7 @@ define half @v_fneg_round_f16(half %a) #0 { ; GFX11-NSZ-NEXT: v_sub_f16_e32 v2, v0, v1 ; GFX11-NSZ-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s0 +; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 ; GFX11-NSZ-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_sub_f16_e64 v0, -v1, v0 @@ -4676,7 +4677,7 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 0077951c4967e..b2d30b751ae2c 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -164,7 +164,7 @@ define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) { ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %select = select i1 %cond, i16 %arg0, i16 %arg1 @@ -210,15 +210,14 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 -; GFX11-NEXT: v_cndmask_b16 v0, v3, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v1, v5, v4, s0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 @@ -257,7 +256,7 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b16 v1, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff8000, v1 ; GFX11-NEXT: global_store_b16 v[3:4], v1, off ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -583,16 +582,16 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v2 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v0 +; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = fneg half %arg0 %select0 = select i1 %cond0, half %arg1, half %fneg0 @@ -619,16 +618,16 @@ define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff8000, v2 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0 +; GFX11-NEXT: v_xor_b32_e32 v2, 0xffff8000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = xor i16 %arg0, -32768 %select0 = select i1 %cond0, i16 %arg1, i16 %fneg0 @@ -703,30 +702,29 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 ; GFX11-LABEL: select_fneg_select_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b16 v0, v1, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, v4, v5, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-NEXT: v_perm_b32 v4, v0, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v1, v0, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v1, v1, v4, s0 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v2, 1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = fneg <2 x half> %arg0 %select0 = select <2 x i1> %cond0, <2 x half> %arg1, <2 x half> %fneg0 @@ -789,30 +787,29 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1, ; GFX11-LABEL: select_fneg_xor_select_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b16 v0, v1, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, v4, v5, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-NEXT: v_perm_b32 v4, v0, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v1, v0, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v1, v1, v4, s0 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v2, 1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = xor <2 x i16> %arg0, %select0 = select <2 x i1> %cond0, <2 x i16> %arg1, <2 x i16> %fneg0 diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index e6513fa7b920f..80b4d64b1236f 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -2366,10 +2366,10 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly % ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f16_e32 v3, v0 -; GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x7c00, |v0| +; GFX11-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0| ; GFX11-NEXT: v_floor_f16_e32 v4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, 0, v3, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX11-NEXT: global_store_b16 v[1:2], v4, off ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2381,10 +2381,10 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_fract_f16_e32 v3, v0 -; GFX12-NEXT: v_cmp_neq_f16_e64 s0, 0x7c00, |v0| +; GFX12-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0| ; GFX12-NEXT: v_floor_f16_e32 v4, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b16 v0, 0, v3, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-NEXT: global_store_b16 v[1:2], v4, off ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -2539,19 +2539,19 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_fract_f16_e32 v4, v0 -; GFX11-NEXT: v_cmp_class_f16_e64 s1, v0, 0x204 -; GFX11-NEXT: v_floor_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fract_f16_e32 v5, v3 +; GFX11-NEXT: v_fract_f16_e32 v6, v0 +; GFX11-NEXT: v_floor_f16_e32 v5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fract_f16_e32 v4, v3 ; GFX11-NEXT: v_cmp_class_f16_e64 s0, v3, 0x204 -; GFX11-NEXT: v_floor_f16_e32 v3, v3 -; GFX11-NEXT: v_cndmask_b16 v4, v4, 0, s1 -; GFX11-NEXT: v_cndmask_b16 v5, v5, 0, s0 +; GFX11-NEXT: v_floor_f16_e32 v7, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 +; GFX11-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v3, v0, v3 -; GFX11-NEXT: v_pack_b32_f16 v0, v4, v5 -; GFX11-NEXT: global_store_b32 v[1:2], v3, off +; GFX11-NEXT: v_pack_b32_f16 v4, v5, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 +; GFX11-NEXT: global_store_b32 v[1:2], v4, off +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: safe_math_fract_v2f16: @@ -2562,19 +2562,19 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: v_fract_f16_e32 v4, v0 -; GFX12-NEXT: v_cmp_class_f16_e64 s1, v0, 0x204 -; GFX12-NEXT: v_floor_f16_e32 v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_fract_f16_e32 v5, v3 +; GFX12-NEXT: v_fract_f16_e32 v6, v0 +; GFX12-NEXT: v_floor_f16_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_fract_f16_e32 v4, v3 ; GFX12-NEXT: v_cmp_class_f16_e64 s0, v3, 0x204 -; GFX12-NEXT: v_floor_f16_e32 v3, v3 -; GFX12-NEXT: v_cndmask_b16 v4, v4, 0, s1 -; GFX12-NEXT: v_cndmask_b16 v5, v5, 0, s0 +; GFX12-NEXT: v_floor_f16_e32 v7, v3 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 +; GFX12-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_pack_b32_f16 v3, v0, v3 -; GFX12-NEXT: v_pack_b32_f16 v0, v4, v5 -; GFX12-NEXT: global_store_b32 v[1:2], v3, off +; GFX12-NEXT: v_pack_b32_f16 v4, v5, v7 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 +; GFX12-NEXT: global_store_b32 v[1:2], v4, off +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index da891a709ac1c..d09af8fd2ac95 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2513,37 +2513,36 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; GFX11-NEXT: s_cmp_eq_u32 s5, 6 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 7 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 4 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 5 -; GFX11-NEXT: s_cselect_b32 s7, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 2 -; GFX11-NEXT: s_cselect_b32 s8, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 3 -; GFX11-NEXT: s_cselect_b32 s9, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 0 -; GFX11-NEXT: s_cselect_b32 s10, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s5, 1 -; GFX11-NEXT: s_cselect_b32 s5, -1, 0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b16 v5, v3, s4, s2 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_cndmask_b16 v6, v2, s4, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v7, v1, s4, s8 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX11-NEXT: v_cndmask_b16 v0, v0, s4, s10 -; GFX11-NEXT: v_cndmask_b16 v3, v3, s4, s3 -; GFX11-NEXT: v_cndmask_b16 v2, v2, s4, s7 -; GFX11-NEXT: v_cndmask_b16 v1, v1, s4, s9 -; GFX11-NEXT: v_cndmask_b16 v8, v8, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s4, s2 ; GFX11-NEXT: v_perm_b32 v3, v3, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 ; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm @@ -3083,69 +3082,69 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX11-NEXT: s_cmp_eq_u32 s5, 6 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 7 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cndmask_b32_e64 v9, v3, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 4 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 5 -; GFX11-NEXT: s_cselect_b32 s7, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 2 -; GFX11-NEXT: s_cselect_b32 s8, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 3 -; GFX11-NEXT: s_cselect_b32 s9, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 0 -; GFX11-NEXT: s_cselect_b32 s10, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 1 -; GFX11-NEXT: s_cselect_b32 s11, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 14 -; GFX11-NEXT: s_cselect_b32 s12, -1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s4, s3 +; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 15 -; GFX11-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 12 -; GFX11-NEXT: s_cselect_b32 s14, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX11-NEXT: v_perm_b32 v2, v10, v2, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v13, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 13 -; GFX11-NEXT: s_cselect_b32 s15, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 10 -; GFX11-NEXT: s_cselect_b32 s16, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v14, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 11 -; GFX11-NEXT: s_cselect_b32 s17, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 8 -; GFX11-NEXT: s_cselect_b32 s18, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v15, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 9 -; GFX11-NEXT: s_cselect_b32 s5, -1, 0 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cndmask_b16 v9, v3, s4, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b16 v13, v7, s4, s12 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_cndmask_b16 v14, v6, s4, s14 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_cndmask_b16 v15, v5, s4, s16 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_cndmask_b16 v16, v4, s4, s18 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_cndmask_b16 v10, v2, s4, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v11, v1, s4, s8 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_cndmask_b16 v12, v0, s4, s10 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_cndmask_b16 v7, v7, s4, s13 -; GFX11-NEXT: v_cndmask_b16 v6, v6, s4, s15 -; GFX11-NEXT: v_cndmask_b16 v5, v5, s4, s17 -; GFX11-NEXT: v_cndmask_b16 v4, v4, s4, s5 -; GFX11-NEXT: v_cndmask_b16 v3, v3, s4, s3 -; GFX11-NEXT: v_cndmask_b16 v2, v2, s4, s7 -; GFX11-NEXT: v_cndmask_b16 v1, v1, s4, s9 -; GFX11-NEXT: v_cndmask_b16 v0, v0, s4, s11 -; GFX11-NEXT: v_perm_b32 v7, v7, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v5, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v4, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v1, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v0, v12, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: v_perm_b32 v7, v10, v7, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v16, s4, s2 +; GFX11-NEXT: v_perm_b32 v6, v12, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v14, v4, 0x5040100 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 60c3f8f60bccc..1d0367db70143 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -60,10 +60,10 @@ define half @v_maximum_f16(half %src0, half %src1) { ; GFX11-LABEL: v_maximum_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_f16: @@ -180,10 +180,10 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) { ; GFX11-LABEL: v_maximum_f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_f16__nsz: @@ -306,10 +306,10 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_f16__nnan_src0: @@ -387,10 +387,10 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_f16__nnan_src1: @@ -485,10 +485,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX11-LABEL: s_maximum_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f16_e64 s2, s0, s1 ; GFX11-NEXT: v_max_f16_e64 v0, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 +; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v0 @@ -590,17 +590,17 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX11-LABEL: v_maximum_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_pk_max_f16 v4, v0, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v2f16: @@ -749,17 +749,17 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX11-LABEL: v_maximum_v2f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_pk_max_f16 v4, v0, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v2f16__nsz: @@ -939,18 +939,17 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX11-LABEL: s_maximum_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f16_e64 s2, s0, s1 ; GFX11-NEXT: v_pk_max_f16 v0, s0, s1 -; GFX11-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 +; GFX11-NEXT: s_lshr_b32 s2, s1, 16 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_o_f16_e64 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v0 @@ -1064,20 +1063,21 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX11-LABEL: v_maximum_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v1, v3 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s1 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v3f16: @@ -1255,20 +1255,21 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX11-LABEL: v_maximum_v3f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v1, v3 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s1 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v3f16__nsz: @@ -1468,26 +1469,26 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX11-LABEL: v_maximum_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_pk_max_f16 v8, v0, v2 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v0, v2 -; GFX11-NEXT: v_cmp_o_f16_e64 s2, v7, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v8, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 -; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_pk_max_f16 v7, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f16: @@ -1694,26 +1695,26 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX11-LABEL: v_maximum_v4f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_pk_max_f16 v8, v0, v2 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v0, v2 -; GFX11-NEXT: v_cmp_o_f16_e64 s2, v7, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v8, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 -; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_pk_max_f16 v7, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f16__nsz: @@ -1998,44 +1999,44 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX11-LABEL: v_maximum_v8f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 ; GFX11-NEXT: v_pk_max_f16 v8, v3, v7 +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX11-NEXT: v_pk_max_f16 v10, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-NEXT: v_pk_max_f16 v14, v1, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v2, v6 -; GFX11-NEXT: v_pk_max_f16 v9, v2, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v10, 0x7e00, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11 +; GFX11-NEXT: v_pk_max_f16 v11, v0, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v13, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX11-NEXT: v_cndmask_b16 v7, 0x7e00, v9, s0 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v2, v6 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v1, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; GFX11-NEXT: v_pk_max_f16 v12, v0, v4 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX11-NEXT: v_cmp_o_f16_e64 s2, v6, v2 -; GFX11-NEXT: v_cmp_o_f16_e64 s3, v0, v4 -; GFX11-NEXT: v_cmp_o_f16_e64 s4, v11, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s1 -; GFX11-NEXT: v_cndmask_b16 v5, 0x7e00, v12, s3 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s4 -; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s2 -; GFX11-NEXT: v_cndmask_b16 v4, 0x7e00, v4, s0 -; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v4, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v3, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v8f16: @@ -2401,78 +2402,86 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX11-LABEL: v_maximum_v16f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX11-NEXT: v_pk_max_f16 v18, v7, v15 +; GFX11-NEXT: v_pk_max_f16 v16, v7, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v7 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v15 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v6, v14 ; GFX11-NEXT: v_pk_max_f16 v15, v6, v14 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v17, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v18 -; GFX11-NEXT: v_cndmask_b16 v16, 0x7e00, v18, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GFX11-NEXT: v_pk_max_f16 v20, v5, v13 -; GFX11-NEXT: v_cndmask_b16 v7, 0x7e00, v7, s0 -; GFX11-NEXT: v_cndmask_b16 v17, 0x7e00, v15, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-NEXT: v_pk_max_f16 v20, v4, v12 +; GFX11-NEXT: v_pk_max_f16 v22, v2, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v13 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v19, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v20 -; GFX11-NEXT: v_cmp_o_f16_e64 s2, v4, v12 -; GFX11-NEXT: v_pk_max_f16 v13, v4, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX11-NEXT: v_pk_max_f16 v14, v5, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v7, v16, v7, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v6, v15, v6, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 +; GFX11-NEXT: v_pk_max_f16 v17, v3, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v17 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b16 v6, 0x7e00, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v14, 0x7e00, v20, s0 -; GFX11-NEXT: v_cndmask_b16 v5, 0x7e00, v5, s1 -; GFX11-NEXT: v_cndmask_b16 v15, 0x7e00, v13, s2 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v11 -; GFX11-NEXT: v_pk_max_f16 v12, v3, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v2, v10 -; GFX11-NEXT: v_pk_max_f16 v13, v2, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v11 +; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v20, v19 +; GFX11-NEXT: v_pk_max_f16 v19, v1, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v18, 0x7e00, v12, s0 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v12 -; GFX11-NEXT: v_cndmask_b16 v11, 0x7e00, v13, s1 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v2, v10 -; GFX11-NEXT: v_cmp_o_f16_e64 s2, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; GFX11-NEXT: v_pk_max_f16 v20, v0, v8 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v9 -; GFX11-NEXT: v_cmp_o_f16_e64 s3, v10, v2 -; GFX11-NEXT: v_cmp_o_f16_e64 s4, v0, v8 -; GFX11-NEXT: v_cmp_o_f16_e64 s5, v19, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v13 -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s2 -; GFX11-NEXT: v_cndmask_b16 v9, 0x7e00, v20, s4 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s5 -; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s3 -; GFX11-NEXT: v_cndmask_b16 v8, 0x7e00, v8, s1 -; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v3, s0 -; GFX11-NEXT: v_cndmask_b16 v4, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v0, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v8, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v3, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v4, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v3, v11, v3, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_pk_max_f16 v22, v0, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8 +; GFX11-NEXT: v_perm_b32 v1, v1, v21, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v24, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 +; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v4, v4, v14, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 99d0916ae7a28..f8c2c54af2783 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -48,10 +48,10 @@ define half @v_minimum_f16(half %src0, half %src1) { ; GFX11-LABEL: v_minimum_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_f16: @@ -145,10 +145,10 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) { ; GFX11-LABEL: v_minimum_f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_f16__nsz: @@ -247,10 +247,10 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_f16__nnan_src0: @@ -314,10 +314,10 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_f16__nnan_src1: @@ -395,10 +395,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX11-LABEL: s_minimum_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f16_e64 s2, s0, s1 ; GFX11-NEXT: v_min_f16_e64 v0, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 +; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v0 @@ -480,17 +480,17 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX11-LABEL: v_minimum_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_pk_min_f16 v4, v0, v1 +; GFX11-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v2f16: @@ -604,17 +604,17 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX11-LABEL: v_minimum_v2f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_pk_min_f16 v4, v0, v1 +; GFX11-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v2f16__nsz: @@ -752,18 +752,17 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX11-LABEL: s_minimum_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f16_e64 s2, s0, s1 ; GFX11-NEXT: v_pk_min_f16 v0, s0, s1 -; GFX11-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 +; GFX11-NEXT: s_lshr_b32 s2, s1, 16 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_o_f16_e64 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v0 @@ -850,20 +849,21 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX11-LABEL: v_minimum_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v1, v3 -; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s1 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v3f16: @@ -994,20 +994,21 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX11-LABEL: v_minimum_v3f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v1, v3 -; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s1 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v3f16__nsz: @@ -1153,26 +1154,26 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX11-LABEL: v_minimum_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_pk_min_f16 v8, v0, v2 -; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v0, v2 -; GFX11-NEXT: v_cmp_o_f16_e64 s2, v7, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v8, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 -; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_pk_min_f16 v7, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f16: @@ -1320,26 +1321,26 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX11-LABEL: v_minimum_v4f16__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_pk_min_f16 v8, v0, v2 -; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v4 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v0, v2 -; GFX11-NEXT: v_cmp_o_f16_e64 s2, v7, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v8, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s2 -; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_pk_min_f16 v7, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f16__nsz: @@ -1537,44 +1538,44 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX11-LABEL: v_minimum_v8f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 ; GFX11-NEXT: v_pk_min_f16 v8, v3, v7 +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX11-NEXT: v_pk_min_f16 v10, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-NEXT: v_pk_min_f16 v14, v1, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v2, v6 -; GFX11-NEXT: v_pk_min_f16 v9, v2, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v10, 0x7e00, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11 +; GFX11-NEXT: v_pk_min_f16 v11, v0, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v13, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX11-NEXT: v_cndmask_b16 v7, 0x7e00, v9, s0 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v2, v6 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v1, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; GFX11-NEXT: v_pk_min_f16 v12, v0, v4 -; GFX11-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX11-NEXT: v_cmp_o_f16_e64 s2, v6, v2 -; GFX11-NEXT: v_cmp_o_f16_e64 s3, v0, v4 -; GFX11-NEXT: v_cmp_o_f16_e64 s4, v11, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s1 -; GFX11-NEXT: v_cndmask_b16 v5, 0x7e00, v12, s3 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s4 -; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s2 -; GFX11-NEXT: v_cndmask_b16 v4, 0x7e00, v4, s0 -; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v4, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v3, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v8f16: @@ -1820,78 +1821,86 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX11-LABEL: v_minimum_v16f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX11-NEXT: v_pk_min_f16 v18, v7, v15 +; GFX11-NEXT: v_pk_min_f16 v16, v7, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v7 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v15 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v6, v14 ; GFX11-NEXT: v_pk_min_f16 v15, v6, v14 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v17, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v18 -; GFX11-NEXT: v_cndmask_b16 v16, 0x7e00, v18, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GFX11-NEXT: v_pk_min_f16 v20, v5, v13 -; GFX11-NEXT: v_cndmask_b16 v7, 0x7e00, v7, s0 -; GFX11-NEXT: v_cndmask_b16 v17, 0x7e00, v15, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-NEXT: v_pk_min_f16 v20, v4, v12 +; GFX11-NEXT: v_pk_min_f16 v22, v2, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v5, v13 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v19, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v20 -; GFX11-NEXT: v_cmp_o_f16_e64 s2, v4, v12 -; GFX11-NEXT: v_pk_min_f16 v13, v4, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX11-NEXT: v_pk_min_f16 v14, v5, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v7, v16, v7, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v6, v15, v6, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 +; GFX11-NEXT: v_pk_min_f16 v17, v3, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v17 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b16 v6, 0x7e00, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v14, 0x7e00, v20, s0 -; GFX11-NEXT: v_cndmask_b16 v5, 0x7e00, v5, s1 -; GFX11-NEXT: v_cndmask_b16 v15, 0x7e00, v13, s2 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v11 -; GFX11-NEXT: v_pk_min_f16 v12, v3, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v2, v10 -; GFX11-NEXT: v_pk_min_f16 v13, v2, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v11 +; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v20, v19 +; GFX11-NEXT: v_pk_min_f16 v19, v1, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v18, 0x7e00, v12, s0 -; GFX11-NEXT: v_cmp_o_f16_e64 s0, v3, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v12 -; GFX11-NEXT: v_cndmask_b16 v11, 0x7e00, v13, s1 -; GFX11-NEXT: v_cmp_o_f16_e64 s1, v2, v10 -; GFX11-NEXT: v_cmp_o_f16_e64 s2, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; GFX11-NEXT: v_pk_min_f16 v20, v0, v8 -; GFX11-NEXT: v_pk_min_f16 v1, v1, v9 -; GFX11-NEXT: v_cmp_o_f16_e64 s3, v10, v2 -; GFX11-NEXT: v_cmp_o_f16_e64 s4, v0, v8 -; GFX11-NEXT: v_cmp_o_f16_e64 s5, v19, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v13 -; GFX11-NEXT: v_cndmask_b16 v1, 0x7e00, v1, s2 -; GFX11-NEXT: v_cndmask_b16 v9, 0x7e00, v20, s4 -; GFX11-NEXT: v_cndmask_b16 v0, 0x7e00, v0, s5 -; GFX11-NEXT: v_cndmask_b16 v2, 0x7e00, v2, s3 -; GFX11-NEXT: v_cndmask_b16 v8, 0x7e00, v8, s1 -; GFX11-NEXT: v_cndmask_b16 v3, 0x7e00, v3, s0 -; GFX11-NEXT: v_cndmask_b16 v4, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v0, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v8, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v3, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v4, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v3, v11, v3, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_pk_min_f16 v22, v0, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8 +; GFX11-NEXT: v_perm_b32 v1, v1, v21, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v24, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 +; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v4, v4, v14, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index ed029a3c6a259..c0a85bba93b73 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -838,7 +838,7 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX11-NEXT: v_sub_f16_e32 v1, s2, v0 ; GFX11-NEXT: v_cmp_ge_f16_e64 s3, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v1, 0, 0x3c00, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s3 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -980,20 +980,20 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_f16_e32 v3, s2, v1 ; GFX11-NEXT: v_sub_f16_e32 v2, s3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f16_e64 s7, |v3|, 0.5 -; GFX11-NEXT: v_cmp_ge_f16_e64 s6, |v2|, 0.5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v3, 0, 0x3c00, s7 -; GFX11-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v3|, 0.5 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s4 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll index f1678bb8ee4d4..072ee70b840d8 100644 --- a/llvm/test/CodeGen/AMDGPU/lround.ll +++ b/llvm/test/CodeGen/AMDGPU/lround.ll @@ -824,7 +824,7 @@ define half @intrinsic_fround_half(half %arg) { ; GFX11-SDAG-NEXT: v_sub_f16_e32 v2, v0, v1 ; GFX11-SDAG-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 ; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_f16_e32 v0, v1, v0 @@ -839,7 +839,7 @@ define half @intrinsic_fround_half(half %arg) { ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 ; GFX11-GISEL-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 ; GFX11-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v1, v0 @@ -915,7 +915,7 @@ define i32 @intrinsic_lround_i32_f16(half %arg) { ; GFX11-SDAG-NEXT: v_sub_f16_e32 v2, v0, v1 ; GFX11-SDAG-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 ; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_f16_e32 v0, v1, v0 @@ -933,7 +933,7 @@ define i32 @intrinsic_lround_i32_f16(half %arg) { ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 ; GFX11-GISEL-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b16 v2, 0, 0x3c00, s0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 ; GFX11-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index 4476d0f43ec4a..fa15a42aef2ac 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -236,19 +236,18 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v1, v1, v0, s0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 @@ -262,12 +261,13 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximumnum_bf16: @@ -278,19 +278,18 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b16 v1, v1, v0, s0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_cmp_eq_u16_e64 s0, 0, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 @@ -304,12 +303,13 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) ret bfloat %result @@ -369,18 +369,17 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, s0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximumnum_bf16_nnan: @@ -392,18 +391,17 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 -; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call nnan bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) ret bfloat %result diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index 0cdbec9dd094a..f5fb85d63b8e4 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -238,19 +238,18 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v1, v1, v0, s0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 @@ -264,12 +263,13 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimumnum_bf16: @@ -280,19 +280,18 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b16 v1, v1, v0, s0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 @@ -306,12 +305,13 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b16 v0, v3, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y) ret bfloat %result @@ -373,18 +373,17 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, s0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimumnum_bf16_nnan: @@ -396,18 +395,17 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b16 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 -; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call nnan bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y) ret bfloat %result diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll index e0ea9116e214f..7c1da18de70f8 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll @@ -34,7 +34,7 @@ define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -77,7 +77,7 @@ define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e64 v1, |v1|, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v4 @@ -123,7 +123,7 @@ define { half, half } @add_select_multi_store_use_lhs_fabs_fabs_f16(i32 %c, half ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3 @@ -169,7 +169,7 @@ define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e64 v1, |v2|, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3 @@ -212,10 +212,10 @@ define half @add_select_fabs_var_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_fabs_var_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -251,10 +251,10 @@ define half @add_select_fabs_negk_f16(i32 %c, half %x, half %y) { ; GFX11-LABEL: add_select_fabs_negk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -289,10 +289,10 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) { ; GFX11-LABEL: add_select_fabs_negk_negk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0xc000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -326,10 +326,10 @@ define half @add_select_posk_posk_f16(i32 %c, half %x) { ; GFX11-LABEL: add_select_posk_posk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, 0x4000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x4000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -367,7 +367,7 @@ define half @add_select_negk_fabs_f16(i32 %c, half %x, half %y) { ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -407,7 +407,7 @@ define half @add_select_negliteralk_fabs_f16(i32 %c, half %x, half %y) { ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0xe400, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xe400, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -443,7 +443,7 @@ define half @add_select_fabs_posk_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -480,7 +480,7 @@ define half @add_select_posk_fabs_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -518,7 +518,7 @@ define half @add_select_fneg_fneg_f16(i32 %c, half %x, half %y, half %z) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -561,7 +561,7 @@ define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_sub_f16_e32 v1, v4, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 @@ -607,7 +607,7 @@ define { half, half } @add_select_multi_store_use_lhs_fneg_fneg_f16(i32 %c, half ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 @@ -653,7 +653,7 @@ define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_sub_f16_e32 v1, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 @@ -696,10 +696,10 @@ define half @add_select_fneg_var_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_fneg_var_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -735,7 +735,7 @@ define half @add_select_fneg_negk_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -773,7 +773,7 @@ define half @add_select_fneg_inv2pi_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, 0xb118, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xb118, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -811,7 +811,7 @@ define half @add_select_fneg_neginv2pi_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, 0x3118, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3118, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -846,10 +846,10 @@ define half @add_select_negk_negk_f16(i32 %c, half %x) { ; GFX11-LABEL: add_select_negk_negk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0xc000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -884,10 +884,10 @@ define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) { ; GFX11-LABEL: add_select_negliteralk_negliteralk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, 0xe800 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0xe800 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0xec00, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xec00, v2, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -920,10 +920,10 @@ define half @add_select_fneg_negk_negk_f16(i32 %c, half %x) { ; GFX11-LABEL: add_select_fneg_negk_negk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0xc000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo ; GFX11-NEXT: v_sub_f16_e32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -959,7 +959,7 @@ define half @add_select_negk_fneg_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -996,7 +996,7 @@ define half @add_select_fneg_posk_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1033,7 +1033,7 @@ define half @add_select_posk_fneg_f16(i32 %c, half %x, half %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1072,11 +1072,11 @@ define half @add_select_negfabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_negfabs_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1116,11 +1116,11 @@ define half @add_select_fabs_negfabs_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_fabs_negfabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x8000, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v1 -; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1160,11 +1160,11 @@ define half @add_select_neg_fabs_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_neg_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1203,11 +1203,11 @@ define half @add_select_fabs_neg_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_fabs_neg_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v1 -; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1245,10 +1245,10 @@ define half @add_select_neg_negfabs_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_neg_negfabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1287,10 +1287,10 @@ define half @add_select_negfabs_neg_f16(i32 %c, half %x, half %y, half %z) { ; GFX11-LABEL: add_select_negfabs_neg_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1328,10 +1328,10 @@ define half @mul_select_negfabs_posk_f16(i32 %c, half %x, half %y) { ; GFX11-LABEL: mul_select_negfabs_posk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x4400, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1371,7 +1371,7 @@ define half @mul_select_posk_negfabs_f16(i32 %c, half %x, half %y) { ; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x4400, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1408,10 +1408,10 @@ define half @mul_select_negfabs_negk_f16(i32 %c, half %x, half %y) { ; GFX11-LABEL: mul_select_negfabs_negk_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0xc400, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1451,7 +1451,7 @@ define half @mul_select_negk_negfabs_f16(i32 %c, half %x, half %y) { ; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0xc400, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 @@ -1493,8 +1493,8 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) { ; GFX11-SAFE-NEXT: v_add_f16_e32 v1, 4.0, v1 ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v1 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_add_f16: @@ -1519,10 +1519,10 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) { ; GFX11-NSZ-LABEL: select_fneg_posk_src_add_f16: ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-NEXT: v_sub_f16_e32 v1, -4.0, v1 ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: v_sub_f16_e32 v0, -4.0, v1 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %add = fadd half %x, 4.0 @@ -1558,8 +1558,8 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) { ; GFX11-SAFE-NEXT: v_add_f16_e32 v1, -4.0, v1 ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v1 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_sub_f16: @@ -1584,10 +1584,10 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) { ; GFX11-NSZ-LABEL: select_fneg_posk_src_sub_f16: ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-NEXT: v_sub_f16_e32 v1, 4.0, v1 ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: v_sub_f16_e32 v0, 4.0, v1 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %add = fsub half %x, 4.0 @@ -1619,10 +1619,10 @@ define half @select_fneg_posk_src_mul_f16(i32 %c, half %x) { ; GFX11-LABEL: select_fneg_posk_src_mul_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_mul_f16_e32 v0, -4.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %mul = fmul half %x, 4.0 @@ -1660,8 +1660,8 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) { ; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, 4.0, v1 ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_fma_f16: @@ -1688,10 +1688,10 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) { ; GFX11-NSZ-LABEL: select_fneg_posk_src_fma_f16: ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-NEXT: v_fma_f16 v1, v1, -4.0, -v2 ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: v_fma_f16 v0, v1, -4.0, -v2 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fma = call half @llvm.fma.f16(half %x, half 4.0, half %z) @@ -1730,8 +1730,8 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) { ; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, 4.0, v1 ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_fmad_f16: @@ -1759,10 +1759,10 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) { ; GFX11-NSZ-LABEL: select_fneg_posk_src_fmad_f16: ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-NEXT: v_fma_f16 v1, v1, -4.0, -v2 ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: v_fma_f16 v0, v1, -4.0, -v2 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fmad = call half @llvm.fmuladd.f16(half %x, half 4.0, half %z) diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index d5b5b052e7ccb..d2bb971b68030 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -68,18 +68,18 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-LABEL: add_select_fabs_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v2, v3, v2, s0 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -161,17 +161,17 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v3, v3, v2, s0 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v1, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -246,19 +246,19 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1 ; GFX11-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v3, v3, v2, s0 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -342,17 +342,17 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v2, v3, v2, s0 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v1, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -424,16 +424,15 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -497,15 +496,14 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v0, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -569,14 +567,13 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, 0xc000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v3, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -634,12 +631,12 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v3, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, 0x3c00, v3, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -703,13 +700,13 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -775,13 +772,13 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v1, 0xe400, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v0, 0xe400, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xe400, v2, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xe400, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -845,15 +842,14 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v1, 0x3c00, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v0, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -918,13 +914,13 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v1, 0x3c00, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -989,15 +985,15 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-LABEL: add_select_fneg_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v5, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1073,16 +1069,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x ; GFX11-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v6, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v1, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1157,16 +1153,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1 ; GFX11-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v5, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1244,16 +1240,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x ; GFX11-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v6, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v1, v5, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1329,16 +1325,15 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1399,14 +1394,14 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-LABEL: add_select_fneg_negk_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, 0x3c00, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1468,14 +1463,14 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11-LABEL: add_select_fneg_inv2pi_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, 0xb118, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, 0xb118, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xb118, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xb118, v2, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1537,14 +1532,14 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX11-LABEL: add_select_fneg_neginv2pi_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, 0x3118, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, 0x3118, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3118, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3118, v2, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1602,12 +1597,12 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, 0xc000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v3, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1666,12 +1661,12 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, 0xe800 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, 0xec00, v3, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, 0xec00, v3, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xec00, v3, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xec00, v3, vcc_lo +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1728,12 +1723,12 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, 0xc000 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v3, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1794,14 +1789,14 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-LABEL: add_select_negk_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, 0x3c00, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, 0x3c00, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1862,14 +1857,14 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-LABEL: add_select_fneg_posk_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -1930,14 +1925,14 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX11-LABEL: add_select_posk_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, 0xbc00, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, 0xbc00, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2012,16 +2007,16 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v2, v3, v2, s0 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2096,18 +2091,18 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11-LABEL: add_select_fabs_negfabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_or_b32_e32 v3, 0x80008000, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v2, v3, v2, s0 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2184,16 +2179,16 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v2, v3, v2, s0 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2267,18 +2262,18 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX11-LABEL: add_select_fabs_neg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX11-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v2, v3, v2, s0 -; GFX11-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2347,16 +2342,15 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_cndmask_b16 v0, v5, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v1, v3, v2, s0 -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2425,17 +2419,16 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11-LABEL: add_select_negfabs_neg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v0, v5, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v1, v2, v3, s0 -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2505,15 +2498,14 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v1, 0x4400, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x4400, v0, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2583,13 +2575,13 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v1, 0x4400, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v0, 0x4400, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v2, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2658,15 +2650,14 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v1, 0xc400, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0xc400, v0, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xc400, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2736,13 +2727,13 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v1, 0xc400, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v0, 0xc400, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v2, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xc400, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer @@ -2818,16 +2809,15 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_add_v2f16: @@ -2878,14 +2868,14 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NSZ-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NSZ-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo -; GFX11-NSZ-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %add = fadd <2 x half> %x, @@ -2955,16 +2945,15 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] +; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_sub_v2f16: @@ -3015,14 +3004,14 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NSZ-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NSZ-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo -; GFX11-NSZ-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %add = fsub <2 x half> %x, @@ -3080,14 +3069,14 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_mul_f16 v2, v2, -4.0 op_sel_hi:[1,0] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %mul = fmul <2 x half> %x, @@ -3163,16 +3152,15 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] +; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_fma_v2f16: @@ -3209,14 +3197,14 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NSZ-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NSZ-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo -; GFX11-NSZ-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> , <2 x half> %z) @@ -3294,16 +3282,15 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] +; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 -; GFX11-SAFE-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: @@ -3361,14 +3348,14 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NSZ-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NSZ-NEXT: v_cndmask_b16 v1, 0x4000, v2, s0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cndmask_b16 v0, 0x4000, v0, vcc_lo -; GFX11-NSZ-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fmad = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %x, <2 x half> , <2 x half> %z) diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll index 5111870da5a63..50a3336a7483c 100644 --- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll @@ -534,7 +534,7 @@ define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b %val = select i1 %cmp, half %a, half %b @@ -567,7 +567,7 @@ define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b %val = select nnan i1 %cmp, half %a, half %b @@ -600,7 +600,7 @@ define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b %val = select nsz i1 %cmp, half %a, half %b @@ -664,7 +664,7 @@ define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge half %a, %b %val = select i1 %cmp, half %a, half %b @@ -697,7 +697,7 @@ define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge half %a, %b %val = select nnan i1 %cmp, half %a, half %b @@ -730,7 +730,7 @@ define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 -; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge half %a, %b %val = select nsz i1 %cmp, half %a, half %b @@ -806,12 +806,11 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: v_cmp_ngt_f16_e64 s0, v0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, s0 -; GFX12-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b @@ -857,12 +856,11 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: v_cmp_ngt_f16_e64 s0, v0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, s0 -; GFX12-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b @@ -908,12 +906,11 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: v_cmp_ngt_f16_e64 s0, v0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, s0 -; GFX12-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b @@ -995,12 +992,11 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: v_cmp_nlt_f16_e64 s0, v0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, s0 -; GFX12-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x half> %a, %b @@ -1046,12 +1042,11 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: v_cmp_nlt_f16_e64 s0, v0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, s0 -; GFX12-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x half> %a, %b @@ -1097,12 +1092,11 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: v_cmp_nlt_f16_e64 s0, v0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_cndmask_b16 v0, v1, v0, s0 -; GFX12-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x half> %a, %b @@ -1199,23 +1193,22 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX12-NEXT: v_cmp_ngt_f16_e64 s0, v0, v2 -; GFX12-NEXT: v_cmp_ngt_f16_e64 s1, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cmp_ngt_f16_e64 s2, v7, v6 -; GFX12-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b16 v2, v4, v5, s1 -; GFX12-NEXT: v_cndmask_b16 v3, v6, v7, s2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -1275,23 +1268,22 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX12-NEXT: v_cmp_ngt_f16_e64 s0, v0, v2 -; GFX12-NEXT: v_cmp_ngt_f16_e64 s1, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cmp_ngt_f16_e64 s2, v7, v6 -; GFX12-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b16 v2, v4, v5, s1 -; GFX12-NEXT: v_cndmask_b16 v3, v6, v7, s2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <4 x half> %a, %b %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -1351,23 +1343,22 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX12-NEXT: v_cmp_ngt_f16_e64 s0, v0, v2 -; GFX12-NEXT: v_cmp_ngt_f16_e64 s1, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cmp_ngt_f16_e64 s2, v7, v6 -; GFX12-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b16 v2, v4, v5, s1 -; GFX12-NEXT: v_cndmask_b16 v3, v6, v7, s2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <4 x half> %a, %b %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -1475,23 +1466,22 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 -; GFX12-NEXT: v_cmp_nlt_f16_e64 s0, v0, v2 -; GFX12-NEXT: v_cmp_nlt_f16_e64 s1, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cmp_nlt_f16_e64 s2, v7, v6 -; GFX12-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b16 v2, v4, v5, s1 -; GFX12-NEXT: v_cndmask_b16 v3, v6, v7, s2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -1551,23 +1541,22 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 -; GFX12-NEXT: v_cmp_nlt_f16_e64 s0, v0, v2 -; GFX12-NEXT: v_cmp_nlt_f16_e64 s1, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cmp_nlt_f16_e64 s2, v7, v6 -; GFX12-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b16 v2, v4, v5, s1 -; GFX12-NEXT: v_cndmask_b16 v3, v6, v7, s2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <4 x half> %a, %b %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -1627,23 +1616,22 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 -; GFX12-NEXT: v_cmp_nlt_f16_e64 s0, v0, v2 -; GFX12-NEXT: v_cmp_nlt_f16_e64 s1, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cmp_nlt_f16_e64 s2, v7, v6 -; GFX12-NEXT: v_cndmask_b16 v1, v3, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b16 v0, v2, v0, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b16 v2, v4, v5, s1 -; GFX12-NEXT: v_cndmask_b16 v3, v6, v7, s2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <4 x half> %a, %b %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 28b0d2cfba731..572026da79646 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -101,12 +101,12 @@ define amdgpu_kernel void @select_f16( ; GFX11-NEXT: s_mov_b32 s17, s11 ; GFX11-NEXT: s_mov_b32 s20, s12 ; GFX11-NEXT: s_mov_b32 s21, s13 +; GFX11-NEXT: s_mov_b32 s24, s14 +; GFX11-NEXT: s_mov_b32 s25, s15 ; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s24, s14 -; GFX11-NEXT: s_mov_b32 s25, s15 ; GFX11-NEXT: buffer_load_u16 v2, off, s[24:27], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_u16 v3, off, s[0:3], 0 glc dlc @@ -114,7 +114,7 @@ define amdgpu_kernel void @select_f16( ; GFX11-NEXT: s_mov_b32 s4, s8 ; GFX11-NEXT: s_mov_b32 s5, s9 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b16 v0, v3, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm @@ -210,25 +210,25 @@ define amdgpu_kernel void @select_f16_imm_a( ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_mov_b32 s18, s10 ; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s2 ; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s8, s0 ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -321,25 +321,25 @@ define amdgpu_kernel void @select_f16_imm_b( ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_mov_b32 s18, s10 ; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s2 ; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s8, s0 ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0 -; GFX11-NEXT: v_cndmask_b16 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -433,25 +433,25 @@ define amdgpu_kernel void @select_f16_imm_c( ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_mov_b32 s18, s10 ; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s2 ; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: s_mov_b32 s16, s4 ; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b16 v0, 0x3800, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -545,25 +545,25 @@ define amdgpu_kernel void @select_f16_imm_d( ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_mov_b32 s18, s10 ; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s2 ; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: s_mov_b32 s16, s4 ; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b16 v0, 0x3800, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -686,12 +686,12 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x44 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, s2 +; GFX11-NEXT: s_mov_b32 s7, s3 ; GFX11-NEXT: s_mov_b32 s22, s2 ; GFX11-NEXT: s_mov_b32 s23, s3 ; GFX11-NEXT: s_mov_b32 s18, s2 ; GFX11-NEXT: s_mov_b32 s19, s3 -; GFX11-NEXT: s_mov_b32 s6, s2 -; GFX11-NEXT: s_mov_b32 s7, s3 ; GFX11-NEXT: s_mov_b32 s26, s2 ; GFX11-NEXT: s_mov_b32 s27, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -699,30 +699,28 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-NEXT: s_mov_b32 s21, s13 ; GFX11-NEXT: s_mov_b32 s16, s10 ; GFX11-NEXT: s_mov_b32 s17, s11 -; GFX11-NEXT: buffer_load_b32 v0, off, s[20:23], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-NEXT: s_mov_b32 s24, s14 ; GFX11-NEXT: s_mov_b32 s25, s15 -; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 ; GFX11-NEXT: buffer_load_b32 v3, off, s[24:27], 0 +; GFX11-NEXT: s_mov_b32 s0, s8 ; GFX11-NEXT: s_mov_b32 s1, s9 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v2, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: v_cmp_lt_f16_e64 s0, v5, v4 -; GFX11-NEXT: v_cndmask_b16 v2, v2, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX11-NEXT: s_mov_b32 s0, s8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -829,42 +827,42 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; ; GFX11-LABEL: select_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s2 -; GFX11-NEXT: s_mov_b32 s15, s3 -; GFX11-NEXT: s_mov_b32 s18, s2 -; GFX11-NEXT: s_mov_b32 s19, s3 -; GFX11-NEXT: s_mov_b32 s22, s2 -; GFX11-NEXT: s_mov_b32 s23, s3 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: s_mov_b32 s16, s8 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s17, s9 -; GFX11-NEXT: s_mov_b32 s20, s10 -; GFX11-NEXT: s_mov_b32 s21, s11 ; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 -; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cmp_lt_f16_e64 s0, 0x3900, v3 -; GFX11-NEXT: v_cndmask_b16 v1, v2, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v4, v0, s0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0x3900, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, @@ -968,42 +966,42 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; ; GFX11-LABEL: select_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s2 -; GFX11-NEXT: s_mov_b32 s15, s3 -; GFX11-NEXT: s_mov_b32 s18, s2 -; GFX11-NEXT: s_mov_b32 s19, s3 -; GFX11-NEXT: s_mov_b32 s22, s2 -; GFX11-NEXT: s_mov_b32 s23, s3 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: s_mov_b32 s16, s8 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s17, s9 -; GFX11-NEXT: s_mov_b32 s20, s10 -; GFX11-NEXT: s_mov_b32 s21, s11 ; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 -; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cmp_gt_f16_e64 s0, 0x3900, v3 -; GFX11-NEXT: v_cndmask_b16 v1, v2, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v0, v4, v0, s0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0x3900, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1109,42 +1107,42 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; ; GFX11-LABEL: select_v2f16_imm_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s18, s2 -; GFX11-NEXT: s_mov_b32 s19, s3 -; GFX11-NEXT: s_mov_b32 s14, s2 -; GFX11-NEXT: s_mov_b32 s15, s3 -; GFX11-NEXT: s_mov_b32 s22, s2 -; GFX11-NEXT: s_mov_b32 s23, s3 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s16, s8 -; GFX11-NEXT: s_mov_b32 s17, s9 -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 ; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s20, s10 -; GFX11-NEXT: s_mov_b32 s21, s11 -; GFX11-NEXT: s_mov_b32 s1, s5 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_nlt_f16_e64 s0, v4, v3 -; GFX11-NEXT: v_cndmask_b16 v1, 0x3800, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v0, 0x3900, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_mov_b32 s0, s4 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo +; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1250,42 +1248,42 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; ; GFX11-LABEL: select_v2f16_imm_d: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s18, s2 -; GFX11-NEXT: s_mov_b32 s19, s3 -; GFX11-NEXT: s_mov_b32 s14, s2 -; GFX11-NEXT: s_mov_b32 s15, s3 -; GFX11-NEXT: s_mov_b32 s22, s2 -; GFX11-NEXT: s_mov_b32 s23, s3 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s16, s8 -; GFX11-NEXT: s_mov_b32 s17, s9 -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 ; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s20, s10 -; GFX11-NEXT: s_mov_b32 s21, s11 -; GFX11-NEXT: s_mov_b32 s1, s5 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_lt_f16_e64 s0, v4, v3 -; GFX11-NEXT: v_cndmask_b16 v1, 0x3800, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v0, 0x3900, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_mov_b32 s0, s4 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1405,23 +1403,22 @@ define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond ; GFX11-LABEL: v_vselect_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v4 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 -; GFX11-NEXT: v_cndmask_b16 v4, v7, v5, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b16 v5, v9, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v0, v2, v0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b16 v1, v3, v1, s2 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <4 x i32> %cond, zeroinitializer %select = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -1599,36 +1596,37 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; GFX11-LABEL: v_vselect_v8f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v19, v18, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v20, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v10 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v9 -; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 0, v11 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v13 -; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 0, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v14 -; GFX11-NEXT: v_cndmask_b16 v8, v9, v8, s5 -; GFX11-NEXT: v_cndmask_b16 v9, v11, v10, s4 -; GFX11-NEXT: v_cndmask_b16 v10, v13, v12, s3 -; GFX11-NEXT: v_cndmask_b16 v11, v16, v15, s2 -; GFX11-NEXT: v_cndmask_b16 v0, v4, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, v5, v1, s0 -; GFX11-NEXT: v_cndmask_b16 v2, v6, v2, s1 -; GFX11-NEXT: v_cndmask_b16 v3, v7, v3, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v0, v11, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v10, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v9, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v8, v3, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10 +; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-NEXT: v_perm_b32 v3, v15, v3, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <8 x i32> %cond, zeroinitializer %select = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b @@ -1992,64 +1990,67 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v18 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v20 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v22 -; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 0, v24 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v26 -; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 -; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v30 -; GFX11-NEXT: v_cmp_eq_u32_e64 s7, 0, v17 -; GFX11-NEXT: v_cmp_eq_u32_e64 s8, 0, v19 -; GFX11-NEXT: v_cmp_eq_u32_e64 s9, 0, v21 -; GFX11-NEXT: v_cmp_eq_u32_e64 s10, 0, v23 -; GFX11-NEXT: v_cmp_eq_u32_e64 s11, 0, v25 -; GFX11-NEXT: v_cmp_eq_u32_e64 s12, 0, v27 -; GFX11-NEXT: v_cmp_eq_u32_e64 s13, 0, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GFX11-NEXT: v_cndmask_b16 v0, v8, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v18, v19, v18, s13 -; GFX11-NEXT: v_cndmask_b16 v19, v21, v20, s12 -; GFX11-NEXT: v_cndmask_b16 v20, v23, v22, s11 -; GFX11-NEXT: v_cndmask_b16 v21, v25, v24, s10 -; GFX11-NEXT: v_cndmask_b16 v22, v27, v26, s9 -; GFX11-NEXT: v_cndmask_b16 v23, v29, v28, s8 -; GFX11-NEXT: v_cndmask_b16 v24, v32, v30, s7 -; GFX11-NEXT: v_cndmask_b16 v7, v15, v7, s6 -; GFX11-NEXT: v_cndmask_b16 v6, v14, v6, s5 -; GFX11-NEXT: v_cndmask_b16 v5, v13, v5, s4 -; GFX11-NEXT: v_cndmask_b16 v4, v12, v4, s3 -; GFX11-NEXT: v_cndmask_b16 v1, v9, v1, s0 -; GFX11-NEXT: v_cndmask_b16 v2, v10, v2, s1 -; GFX11-NEXT: v_cndmask_b16 v3, v11, v3, s2 -; GFX11-NEXT: v_perm_b32 v0, v24, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v20, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v23, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v22, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v21, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v19, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v18, v6, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v35, v34, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v37, v36, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v6, v8, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v39, v38, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v53, v52, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v4, v10, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v55, v54, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v51, v50, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v49, v48, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v14, v3, 0x5040100 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 -; GFX11-NEXT: v_cndmask_b16 v8, v17, v16, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v33, v32, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v12, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v7, v11, v7, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <16 x i32> %cond, zeroinitializer %select = select <16 x i1> %cmp, <16 x half> %a, <16 x half> %b @@ -2921,40 +2922,39 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v83, off, s32 -; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:96 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v15 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128 ; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v14 ; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v30 ; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v13 @@ -2982,123 +2982,131 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v18 ; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v15 ; GFX11-NEXT: s_waitcnt vmcnt(32) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v97, v98, v97, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(31) -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(30) -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v99, v100, v99, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(29) -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v34 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v102, v101, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(28) -; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 0, v35 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v35, v112, v103, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(27) -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v36 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v36 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v114, v113, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(26) -; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 0, v37 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v37, v116, v115, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(25) -; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v38 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v38 +; GFX11-NEXT: v_cndmask_b32_e32 v38, v118, v117, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(24) -; GFX11-NEXT: v_cmp_eq_u32_e64 s7, 0, v39 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v39 +; GFX11-NEXT: v_cndmask_b32_e32 v39, v128, v119, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(23) -; GFX11-NEXT: v_cmp_eq_u32_e64 s8, 0, v48 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v48 +; GFX11-NEXT: v_cndmask_b32_e32 v48, v130, v129, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(22) -; GFX11-NEXT: v_cmp_eq_u32_e64 s9, 0, v49 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v49 +; GFX11-NEXT: v_cndmask_b32_e32 v49, v132, v131, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(21) -; GFX11-NEXT: v_cmp_eq_u32_e64 s10, 0, v50 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v50 +; GFX11-NEXT: v_cndmask_b32_e32 v50, v134, v133, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(20) -; GFX11-NEXT: v_cmp_eq_u32_e64 s11, 0, v51 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v51 +; GFX11-NEXT: v_cndmask_b32_e32 v51, v144, v135, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(19) -; GFX11-NEXT: v_cmp_eq_u32_e64 s12, 0, v52 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v52 +; GFX11-NEXT: v_cndmask_b32_e32 v52, v146, v145, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(18) -; GFX11-NEXT: v_cmp_eq_u32_e64 s13, 0, v53 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v53 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v147, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(17) -; GFX11-NEXT: v_cmp_eq_u32_e64 s14, 0, v54 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v54 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v98, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(16) -; GFX11-NEXT: v_cmp_eq_u32_e64 s15, 0, v55 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v55 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v33, v15, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(15) -; GFX11-NEXT: v_cmp_eq_u32_e64 s16, 0, v64 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v64 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(14) -; GFX11-NEXT: v_cmp_eq_u32_e64 s17, 0, v65 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v65 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v14, v97, v14, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(13) -; GFX11-NEXT: v_cmp_eq_u32_e64 s18, 0, v66 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v66 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(12) -; GFX11-NEXT: v_cmp_eq_u32_e64 s19, 0, v67 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v67 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(11) -; GFX11-NEXT: v_cmp_eq_u32_e64 s20, 0, v68 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v68 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: v_cmp_eq_u32_e64 s21, 0, v69 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v69 +; GFX11-NEXT: v_perm_b32 v13, v99, v13, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: v_cmp_eq_u32_e64 s22, 0, v70 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v70 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: v_cmp_eq_u32_e64 s23, 0, v71 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v71 +; GFX11-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_cmp_eq_u32_e64 s24, 0, v80 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v80 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: v_cmp_eq_u32_e64 s25, 0, v81 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v81 +; GFX11-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_cmp_eq_u32_e64 s26, 0, v82 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v82 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v83 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v83 +; GFX11-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v4, v50, v4, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_cmp_eq_u32_e64 s27, 0, v84 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v84 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_cmp_eq_u32_e64 s28, 0, v85 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v85 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmp_eq_u32_e64 s29, 0, v86 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_hi, 0, v87 -; GFX11-NEXT: v_cndmask_b16 v34, v34, v96, s26 -; GFX11-NEXT: v_cndmask_b16 v35, v98, v97, s27 -; GFX11-NEXT: v_cndmask_b16 v36, v100, v99, s28 -; GFX11-NEXT: v_cndmask_b16 v37, v102, v101, s29 -; GFX11-NEXT: v_cndmask_b16 v38, v112, v103, vcc_hi -; GFX11-NEXT: v_cndmask_b16 v39, v114, v113, s25 -; GFX11-NEXT: v_cndmask_b16 v48, v116, v115, s24 -; GFX11-NEXT: v_cndmask_b16 v49, v118, v117, s23 -; GFX11-NEXT: v_cndmask_b16 v50, v128, v119, s22 -; GFX11-NEXT: v_cndmask_b16 v51, v130, v129, s21 -; GFX11-NEXT: v_cndmask_b16 v52, v132, v131, s20 -; GFX11-NEXT: v_cndmask_b16 v53, v134, v133, s19 -; GFX11-NEXT: v_cndmask_b16 v54, v144, v135, s18 -; GFX11-NEXT: v_cndmask_b16 v55, v146, v145, s17 -; GFX11-NEXT: v_cndmask_b16 v31, v31, v147, s16 -; GFX11-NEXT: v_cndmask_b16 v32, v33, v32, s15 -; GFX11-NEXT: v_cndmask_b16 v15, v83, v15, s14 -; GFX11-NEXT: v_cndmask_b16 v14, v30, v14, s13 -; GFX11-NEXT: v_cndmask_b16 v13, v29, v13, s12 -; GFX11-NEXT: v_cndmask_b16 v12, v28, v12, s11 -; GFX11-NEXT: v_cndmask_b16 v11, v27, v11, s10 -; GFX11-NEXT: v_cndmask_b16 v10, v26, v10, s9 -; GFX11-NEXT: v_cndmask_b16 v9, v25, v9, s8 -; GFX11-NEXT: v_cndmask_b16 v8, v24, v8, s7 -; GFX11-NEXT: v_cndmask_b16 v7, v23, v7, s6 -; GFX11-NEXT: v_cndmask_b16 v6, v22, v6, s5 -; GFX11-NEXT: v_cndmask_b16 v5, v21, v5, s4 -; GFX11-NEXT: v_cndmask_b16 v0, v16, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b16 v1, v17, v1, s0 -; GFX11-NEXT: v_cndmask_b16 v2, v18, v2, s1 -; GFX11-NEXT: v_cndmask_b16 v3, v19, v3, s2 -; GFX11-NEXT: v_cndmask_b16 v4, v20, v4, s3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v86 +; GFX11-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_perm_b32 v0, v32, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v87 +; GFX11-NEXT: v_perm_b32 v3, v51, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v52, v2, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v100, v96, vcc_lo ; GFX11-NEXT: v_perm_b32 v1, v31, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v54, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v53, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v52, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v51, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v49, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v39, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v38, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v37, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v36, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v35, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v34, v15, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <32 x i32> %cond, zeroinitializer %select = select <32 x i1> %cmp, <32 x half> %a, <32 x half> %b diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index 13e1da4a96c8d..c6cc479b5deb1 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -2242,12 +2242,12 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b16 v0, v0, v1, s[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -2263,12 +2263,12 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u32 s2, 0 -; GFX12-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX12-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b16 v0, v0, v1, s[2:3] +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX12-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1