diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b08b6b46fc52c..450c4e12ab02f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -861,9 +861,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasIEEEMinMax()) { setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal); - setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM}, - {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, - Custom); } else { // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum if (Subtarget->hasMinimum3Maximum3F32()) @@ -878,6 +875,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } } + if (Subtarget->hasVOP3PInsts()) { + // We want to break these into v2f16 pieces, not scalarize. + setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM}, + {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, + Custom); + } + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128, diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll index 603e04fc7a7a7..3774c6c0cbbee 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll @@ -11,19 +11,19 @@ define void @maximum_f16() { ; GFX950-FASTF64-LABEL: 'maximum_f16' ; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef) ; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef) -; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef) -; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) -; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) -; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef) +; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) ; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX9-LABEL: 'maximum_f16' ; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef) ; GFX9-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef) -; GFX9-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef) -; GFX9-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) -; GFX9-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) -; GFX9-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) ; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'maximum_f16' @@ -38,10 +38,10 @@ define void @maximum_f16() { ; GFX9-SIZE-LABEL: 'maximum_f16' ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef) ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef) -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef) -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'maximum_f16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll b/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll index 4507ba4929f1b..24b9549dfe3a4 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll @@ -11,19 +11,19 @@ define void @minimum_f16() { ; GFX950-FASTF64-LABEL: 'minimum_f16' ; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef) ; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef) -; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef) -; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) -; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) -; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef) +; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) ; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX9-LABEL: 'minimum_f16' ; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef) ; GFX9-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef) -; GFX9-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef) -; GFX9-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) -; GFX9-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) -; GFX9-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) ; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'minimum_f16' @@ -38,10 +38,10 @@ define void @minimum_f16() { ; GFX9-SIZE-LABEL: 'minimum_f16' ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef) ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef) -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef) -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'minimum_f16' diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 567202be69fa6..53d940e1e6c1a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -2375,21 +2375,21 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_pk_max_f16 v6, v1, v3 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v9, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v5, v1 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX942-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v9 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 ; GFX942-NEXT: s_nop 1 @@ -2437,21 +2437,21 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_pk_max_f16 v6, v1, v3 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v9, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v9, v5 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 ; GFX942-NEXT: s_nop 1 @@ -2500,40 +2500,40 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1 ; GFX942-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3 +; GFX942-NEXT: v_pk_max_f16 v7, v7, v9 ; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 -; GFX942-NEXT: v_pk_max_f16 v7, v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 ; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v6, v6, v8 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| ; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc -; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| -; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v3, v9, v1, s0 +; GFX942-NEXT: v_pk_max_f16 v3, v3, v10 +; GFX942-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX942-NEXT: v_perm_b32 v2, v8, v0, s0 +; GFX942-NEXT: v_perm_b32 v2, v7, v0, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v11 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX942-NEXT: v_perm_b32 v6, v9, v1, s0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX942-NEXT: v_pk_max_f16 v6, v6, v10 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v7, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v3, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc -; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX942-NEXT: v_perm_b32 v0, v6, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f16__fabs_all: @@ -2582,21 +2582,21 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_pk_max_f16 v6, v1, v3 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v9, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] -; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v9, -v5 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 ; GFX942-NEXT: s_nop 1 @@ -2643,22 +2643,21 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 -; GFX942-NEXT: s_mov_b32 s1, 0x5040100 -; GFX942-NEXT: s_movk_i32 s0, 0x7e00 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_mov_b32 s1, 0x5040100 +; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_perm_b32 v4, v6, v0, s1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX942-NEXT: s_movk_i32 s0, 0x7e00 ; GFX942-NEXT: v_pk_max_f16 v4, v4, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0 +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v7, v7, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc @@ -2705,21 +2704,21 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v7, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, 4.0 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v7, v7 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 ; GFX942-NEXT: s_nop 1 @@ -2765,34 +2764,34 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_pk_max_f16 v6, v1, v3 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX942-NEXT: v_pk_max_f16 v2, v5, v2 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v3, v1, v9, s0 +; GFX942-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v9 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX942-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16: @@ -2830,34 +2829,34 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_pk_max_f16 v6, v1, v3 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v5 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v3, v1, v9, s0 +; GFX942-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v9, v5 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16_commute: @@ -2898,42 +2897,40 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2 -; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 -; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 ; GFX942-NEXT: v_pk_max_f16 v7, v7, v9 ; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 ; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 +; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| ; GFX942-NEXT: v_pk_max_f16 v6, v6, v8 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc -; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_perm_b32 v2, v9, v0, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v10 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v11 -; GFX942-NEXT: v_perm_b32 v6, v9, v0, s0 -; GFX942-NEXT: v_cndmask_b32_sdwa v3, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v6, v6, v10 +; GFX942-NEXT: v_perm_b32 v3, v7, v1, s0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v7, |v5| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v3, v3, v11 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v12, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v3, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX942-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX942-NEXT: v_perm_b32 v1, v6, v1, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc ; GFX942-NEXT: v_perm_b32 v0, v7, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2981,34 +2978,34 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_pk_max_f16 v6, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] -; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v3, v1, v9, s0 +; GFX942-NEXT: v_pk_max_f16 v3, v3, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v9, -v5 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__fneg_all: @@ -3046,37 +3043,34 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v4, v6, v0, s0 +; GFX942-NEXT: v_pk_max_f16 v4, v4, v2 +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; GFX942-NEXT: v_perm_b32 v4, v8, v1, s0 +; GFX942-NEXT: v_perm_b32 v7, v8, v1, s0 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX942-NEXT: v_pk_max_f16 v7, v7, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v6, v6, v0, s0 -; GFX942-NEXT: v_pk_max_f16 v6, v6, v2 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0 +; GFX942-NEXT: v_perm_b32 v1, v8, v1, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; GFX942-NEXT: v_perm_b32 v0, v8, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_perm_b32 v0, v6, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm1: @@ -3114,34 +3108,34 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v3, v1, v7, s0 +; GFX942-NEXT: v_pk_max_f16 v3, v3, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v7, v7 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm2: diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 81b8e8ebd10e3..d1d0c0dcdb7e0 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -2375,21 +2375,21 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_pk_min_f16 v6, v1, v3 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v9, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v5, v1 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX942-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v9 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 ; GFX942-NEXT: s_nop 1 @@ -2437,21 +2437,21 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_pk_min_f16 v6, v1, v3 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v9, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v9, v5 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 ; GFX942-NEXT: s_nop 1 @@ -2500,40 +2500,40 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1 ; GFX942-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3 +; GFX942-NEXT: v_pk_min_f16 v7, v7, v9 ; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 -; GFX942-NEXT: v_pk_min_f16 v7, v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 ; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v6, v6, v8 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| ; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc -; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| -; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v3, v9, v1, s0 +; GFX942-NEXT: v_pk_min_f16 v3, v3, v10 +; GFX942-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX942-NEXT: v_perm_b32 v2, v8, v0, s0 +; GFX942-NEXT: v_perm_b32 v2, v7, v0, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v11 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX942-NEXT: v_perm_b32 v6, v9, v1, s0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX942-NEXT: v_pk_min_f16 v6, v6, v10 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v7, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v3, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc -; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX942-NEXT: v_perm_b32 v0, v6, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f16__fabs_all: @@ -2582,21 +2582,21 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_pk_min_f16 v6, v1, v3 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v9, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] -; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v9, -v5 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 ; GFX942-NEXT: s_nop 1 @@ -2643,22 +2643,21 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 -; GFX942-NEXT: s_mov_b32 s1, 0x5040100 -; GFX942-NEXT: s_movk_i32 s0, 0x7e00 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_mov_b32 s1, 0x5040100 +; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_perm_b32 v4, v6, v0, s1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX942-NEXT: s_movk_i32 s0, 0x7e00 ; GFX942-NEXT: v_pk_min_f16 v4, v4, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0 +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v7, v7, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc @@ -2705,21 +2704,21 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v7, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, 4.0 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v7, v7 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 ; GFX942-NEXT: s_nop 1 @@ -2765,34 +2764,34 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_pk_min_f16 v6, v1, v3 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX942-NEXT: v_pk_min_f16 v2, v5, v2 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v3, v1, v9, s0 +; GFX942-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v9 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX942-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16: @@ -2830,34 +2829,34 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_pk_min_f16 v6, v1, v3 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v5 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v3, v1, v9, s0 +; GFX942-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v9, v5 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16_commute: @@ -2898,42 +2897,40 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2 -; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 -; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 ; GFX942-NEXT: v_pk_min_f16 v7, v7, v9 ; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 ; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 +; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| ; GFX942-NEXT: v_pk_min_f16 v6, v6, v8 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc -; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_perm_b32 v2, v9, v0, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v10 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v11 -; GFX942-NEXT: v_perm_b32 v6, v9, v0, s0 -; GFX942-NEXT: v_cndmask_b32_sdwa v3, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v6, v6, v10 +; GFX942-NEXT: v_perm_b32 v3, v7, v1, s0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v7, |v5| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v3, v3, v11 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v12, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v3, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX942-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX942-NEXT: v_perm_b32 v1, v6, v1, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc ; GFX942-NEXT: v_perm_b32 v0, v7, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2981,34 +2978,34 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_pk_min_f16 v6, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] -; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v3, v1, v9, s0 +; GFX942-NEXT: v_pk_min_f16 v3, v3, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v9, -v5 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__fneg_all: @@ -3046,37 +3043,34 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v4, v6, v0, s0 +; GFX942-NEXT: v_pk_min_f16 v4, v4, v2 +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; GFX942-NEXT: v_perm_b32 v4, v8, v1, s0 +; GFX942-NEXT: v_perm_b32 v7, v8, v1, s0 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX942-NEXT: v_pk_min_f16 v7, v7, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v6, v6, v0, s0 -; GFX942-NEXT: v_pk_min_f16 v6, v6, v2 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0 +; GFX942-NEXT: v_perm_b32 v1, v8, v1, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; GFX942-NEXT: v_perm_b32 v0, v8, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_perm_b32 v0, v6, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__inlineimm1: @@ -3114,34 +3108,34 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v3, v1, v7, s0 +; GFX942-NEXT: v_pk_min_f16 v3, v3, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v7, v7 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__inlineimm2: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll index 389df695ba324..41fad10051dac 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll @@ -431,131 +431,122 @@ define half @test_vector_reduce_fmaximum_v8half(<8 x half> %v) { ; GFX9-LABEL: test_vector_reduce_fmaximum_v8half: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_sdwa v4, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_max_f16_e32 v4, v0, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_max_f16_e32 v2, v6, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX9-NEXT: v_max_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX9-NEXT: v_max_f16_e32 v6, v0, v4 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v3 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX9-NEXT: v_max_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v3 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_vector_reduce_fmaximum_v8half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_sdwa v4, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v0 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v4, v0, v1 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v0, v3 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00 +; GFX10-NEXT: v_max_f16_e32 v4, v0, v6 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v6 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4 +; GFX10-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v8half: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.h, v2.h +; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v4.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.h, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v2.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v3.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v8half: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v5, v0, v4 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v4, v0, v1 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v5 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v5 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v5, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v4 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v3 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v3, v0, v6 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v6 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v2 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v8half: @@ -738,180 +729,174 @@ define half @test_vector_reduce_fmaximum_v16half(<16 x half> %v) { ; GFX9-LABEL: test_vector_reduce_fmaximum_v16half: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_sdwa v8, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pk_max_f16 v8, v2, v6 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_max_f16_e32 v8, v0, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc +; GFX9-NEXT: v_pk_max_f16 v8, v0, v4 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 +; GFX9-NEXT: v_perm_b32 v6, v2, v10, s0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_max_f16_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_perm_b32 v4, v0, v11, s0 +; GFX9-NEXT: v_pk_max_f16 v4, v4, v6 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v11, v10 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_max_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v3 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_max_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v4 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_max_f16_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v4, vcc +; GFX9-NEXT: v_max_f16_e32 v2, v6, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v0 +; GFX9-NEXT: v_pk_max_f16 v6, v1, v5 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; GFX9-NEXT: v_pk_max_f16 v2, v3, v7 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v5 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_max_f16_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 +; GFX9-NEXT: v_perm_b32 v3, v2, v4, s0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v6 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX9-NEXT: v_perm_b32 v5, v1, v7, s0 +; GFX9-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v7, v4 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_max_f16_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v6 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v3, vcc +; GFX9-NEXT: v_max_f16_e32 v5, v0, v4 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v7 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v2 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_max_f16_sdwa v1, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v7 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v9, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_vector_reduce_fmaximum_v16half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_sdwa v8, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v0 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v8, v0, v1 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo -; GFX10-NEXT: v_max_f16_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v0, v3 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v0, v4 +; GFX10-NEXT: v_pk_max_f16 v8, v2, v6 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 +; GFX10-NEXT: v_pk_max_f16 v9, v0, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v8, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v0, v5 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v0, v6 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v6 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v0, v7 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_max_f16_sdwa v1, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v7 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v9, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v9, v3, v7 +; GFX10-NEXT: v_perm_b32 v4, v2, v8, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GFX10-NEXT: v_pk_max_f16 v11, v1, v5 +; GFX10-NEXT: v_perm_b32 v10, v0, v6, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v9, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v4, v10, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v12, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v11, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v10, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_perm_b32 v6, v3, v9, 0x5040100 +; GFX10-NEXT: v_perm_b32 v8, v1, v7, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX10-NEXT: v_pk_max_f16 v2, v8, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v9 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00 +; GFX10-NEXT: v_max_f16_e32 v4, v0, v6 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v6 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4 +; GFX10-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v16half: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2.l, v6.l +; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v8, v2, v6 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.h, v6.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v2, v0, v4 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v0.h, v4.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v8.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v8.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v3.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v2.l, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x7e00, v2.h, s2 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v3.h, v7.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v1.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s3, v1.h, v5.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v4, v2, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2.h, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v2, v3, v7 +; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v3, v1, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v4.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v2.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v3.l, s2 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x7e00, v3.h, s3 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v2.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v3.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v4.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v4.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v5.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v5.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v3, v2, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v6.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v6.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v7.l -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v7.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v3.l, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.h, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v3.h, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v7.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v7.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -919,74 +904,70 @@ define half @test_vector_reduce_fmaximum_v16half(<16 x half> %v) { ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v16half: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v9, v0, v8 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v9, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v8, v0, v1 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v9 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v9 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v8 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v3 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v4 +; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v8, v2, v6 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v10, v0, v4 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v8, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v5 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v5 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v6 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v6 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v7 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v7 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v6, v2, v9, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v10, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11 +; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v10, v3, v7 +; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v12, v1, v5 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v13, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v8, v4, v0, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v6, v8, v6 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v10, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v10, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v14, v13 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v15, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v9 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v6, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v2 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v4, v3, v8, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v6, v5, v1, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v11, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v4, v6, v4 +; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v7, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v8 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v3 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v16half: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll index 2f628b7cdb281..61819a85dd82c 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll @@ -509,131 +509,122 @@ define half @test_vector_reduce_fminimum_v8half(<8 x half> %v) { ; GFX9-LABEL: test_vector_reduce_fminimum_v8half: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_sdwa v4, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_min_f16_e32 v4, v0, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_min_f16_e32 v2, v6, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX9-NEXT: v_min_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX9-NEXT: v_min_f16_e32 v6, v0, v4 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v3 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX9-NEXT: v_min_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v3 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_vector_reduce_fminimum_v8half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f16_sdwa v4, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v0 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_min_f16_e32 v4, v0, v1 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_e32 v1, v0, v3 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_min_f16_e32 v5, v4, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00 +; GFX10-NEXT: v_min_f16_e32 v4, v0, v6 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v6 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4 +; GFX10-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v8half: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.h, v2.h +; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v4.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.h, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.h -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v2.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.h -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v3.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v8half: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v5, v0, v4 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v4, v0, v1 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v5 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v5 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v5, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v4 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v3 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v3, v0, v6 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v6 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v2 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v8half: @@ -858,180 +849,174 @@ define half @test_vector_reduce_fminimum_v16half(<16 x half> %v) { ; GFX9-LABEL: test_vector_reduce_fminimum_v16half: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_sdwa v8, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pk_min_f16 v8, v2, v6 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_min_f16_e32 v8, v0, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc +; GFX9-NEXT: v_pk_min_f16 v8, v0, v4 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 +; GFX9-NEXT: v_perm_b32 v6, v2, v10, s0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_min_f16_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_perm_b32 v4, v0, v11, s0 +; GFX9-NEXT: v_pk_min_f16 v4, v4, v6 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v11, v10 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_min_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v3 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_min_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v4 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_min_f16_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v4, vcc +; GFX9-NEXT: v_min_f16_e32 v2, v6, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v0 +; GFX9-NEXT: v_pk_min_f16 v6, v1, v5 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; GFX9-NEXT: v_pk_min_f16 v2, v3, v7 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v5 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_min_f16_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 +; GFX9-NEXT: v_perm_b32 v3, v2, v4, s0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v6 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX9-NEXT: v_perm_b32 v5, v1, v7, s0 +; GFX9-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v7, v4 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_min_f16_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v6 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v3, vcc +; GFX9-NEXT: v_min_f16_e32 v5, v0, v4 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v7 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v2 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_min_f16_sdwa v1, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v7 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v9, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_vector_reduce_fminimum_v16half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f16_sdwa v8, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v0 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo -; GFX10-NEXT: v_min_f16_e32 v8, v0, v1 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo -; GFX10-NEXT: v_min_f16_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo -; GFX10-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_e32 v1, v0, v3 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_e32 v1, v0, v4 +; GFX10-NEXT: v_pk_min_f16 v8, v2, v6 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 +; GFX10-NEXT: v_pk_min_f16 v9, v0, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v8, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_e32 v1, v0, v5 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_e32 v1, v0, v6 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v6 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_e32 v1, v0, v7 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_min_f16_sdwa v1, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v7 src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v9, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v9, v3, v7 +; GFX10-NEXT: v_perm_b32 v4, v2, v8, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GFX10-NEXT: v_pk_min_f16 v11, v1, v5 +; GFX10-NEXT: v_perm_b32 v10, v0, v6, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v9, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v4, v10, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v12, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v11, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v10, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_perm_b32 v6, v3, v9, 0x5040100 +; GFX10-NEXT: v_perm_b32 v8, v1, v7, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX10-NEXT: v_pk_min_f16 v2, v8, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v9 +; GFX10-NEXT: v_min_f16_e32 v5, v4, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00 +; GFX10-NEXT: v_min_f16_e32 v4, v0, v6 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v6 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4 +; GFX10-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v16half: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2.l, v6.l +; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v8, v2, v6 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.h, v6.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v2, v0, v4 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v0.h, v4.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v8.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v8.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v3.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v2.l, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x7e00, v2.h, s2 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v3.h, v7.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v1.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s3, v1.h, v5.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v4, v2, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2.h, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v2, v3, v7 +; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v3, v1, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v4.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v2.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v3.l, s2 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x7e00, v3.h, s3 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.h -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v2.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.h -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v3.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v4.h -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v4.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v5.h -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v5.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v3, v2, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v6.h -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v6.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v7.l -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v7.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v3.l, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.h, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v3.h, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v7.h -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v7.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -1039,74 +1024,70 @@ define half @test_vector_reduce_fminimum_v16half(<16 x half> %v) { ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v16half: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v9, v0, v8 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v9, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v8, v0, v1 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v9 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v9 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v8 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v3 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v4 +; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v8, v2, v6 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v10, v0, v4 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v8, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v5 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v5 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v6 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v6 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v7 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v7 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v6, v2, v9, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v10, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11 +; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v10, v3, v7 +; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v12, v1, v5 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v13, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v8, v4, v0, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v6, v8, v6 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v10, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v10, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v14, v13 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v15, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v9 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v6, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v2 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v4, v3, v8, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v6, v5, v1, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v11, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v4, v6, v4 +; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v7, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v8 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v3 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v16half: