diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0eee7ad0cf923..8d51ec6dc7f31 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -14179,6 +14179,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && (VT == MVT::f32 || VT == MVT::f64 || (VT == MVT::f16 && Subtarget->has16BitInsts()) || + (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) || + (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) || (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) && Op0.hasOneUse()) { if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d04aafba68ed3..8d6c1d0466024 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2508,7 +2508,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addReg(DstHi); } break; + + case AMDGPU::V_MAX_BF16_PSEUDO_e64: + assert(ST.hasBF16PackedInsts()); + MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16)); + MI.addOperand(MachineOperand::CreateImm(0)); // op_sel + MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo + MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi + auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); + Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1); + auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1); + break; } + return true; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b0be3f864b94d..83b049042f804 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2865,6 +2865,7 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>; def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>; +def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>; def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index d05be8f95c618..54fa192aeec92 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in def : ClampPat; let SubtargetPredicate = UseFakeTrue16Insts in def : ClampPat; +// FIXME-TRUE16: Pseudo expansion of this won't work with True16. +let True16Predicate = UseFakeTrue16Insts in +def : ClampPat; let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < @@ -1903,6 +1906,13 @@ def : GCNPat < >; } +let SubtargetPredicate = HasBF16PackedInsts in { +def : GCNPat < + (v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))), + (V_PK_MAX_NUM_BF16 $src0_modifiers, $src0, + $src0_modifiers, $src0, DSTCLAMP.ENABLE) +>; +} // End SubtargetPredicate = HasBF16PackedInsts /********** ================================ **********/ /********** Floating point absolute/negative **********/ diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index c812dc9850514..95fcd4ac1c101 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1236,6 +1236,12 @@ let isCommutable = 1, isReMaterializable = 1 in { defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile, fminnum_like>; defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile, fmaxnum_like>; defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile, any_fma>; + + // Scalar pseudo used to emulate AMDGPUClamp. + // Expanded to V_PK_MAX_NUM_BF16 with unused high half. + // FIXME-TRUE16: Pseudo expansion of this won't work with True16. + let True16Predicate = UseFakeTrue16Insts in + defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>; } } // End isCommutable = 1, isReMaterializable = 1 diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll index 1adf5423356a2..9979e832b799a 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -323,6 +323,146 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> ret void } +define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) { +; GCN-LABEL: test_clamp_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp +; GCN-NEXT: ; return to shader part epilog + %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0) + %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0) + ret bfloat %clamp +} + +define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) { +; GCN-LABEL: test_clamp_bf16_s: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp +; GCN-NEXT: ; return to shader part epilog + %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0) + %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0) + ret bfloat %clamp +} + +define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) { +; GCN-LABEL: test_clamp_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp +; GCN-NEXT: ; return to shader part epilog + %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> ) + %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> ) + %ret = bitcast <2 x bfloat> %clamp to float + ret float %ret +} + +define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) { +; GCN-LABEL: test_clamp_v2bf16_s: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp +; GCN-NEXT: ; return to shader part epilog + %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> ) + %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> ) + %ret = bitcast <2 x bfloat> %clamp to float + ret float %ret +} + +define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) { +; GCN-LABEL: test_clamp_bf16_folding: +; GCN: ; %bb.0: +; GCN-NEXT: v_exp_bf16_e32 v0, v0 +; GCN-NEXT: v_nop +; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp +; GCN-NEXT: ; return to shader part epilog + %exp = call bfloat @llvm.exp2.bf16(bfloat %src) + %max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0) + %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0) + ret bfloat %clamp +} + +define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) { +; GCN-LABEL: test_clamp_v2bf16_folding: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp +; GCN-NEXT: ; return to shader part epilog + %mul = fmul <2 x bfloat> %src0, %src1 + %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> ) + %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> ) + %ret = bitcast <2 x bfloat> %clamp to float + ret float %ret +} + +define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { +; GCN-LABEL: v_test_mul_add_v2bf16_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_pk_add_bf16 v2, v2, v4 +; GCN-NEXT: global_store_b32 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %mul = fmul contract <2 x bfloat> %a, %b + %add = fadd contract <2 x bfloat> %mul, %c + store <2 x bfloat> %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) { +; GCN-LABEL: v_test_mul_add_v2bf16_vss: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_pk_add_bf16 v2, v2, s1 +; GCN-NEXT: global_store_b32 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %mul = fmul contract <2 x bfloat> %a, %b + %add = fadd contract <2 x bfloat> %mul, %c + store <2 x bfloat> %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) { +; GCN-LABEL: v_test_mul_add_v2bf16_sss: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_pk_add_bf16 v2, v2, s2 +; GCN-NEXT: global_store_b32 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %mul = fmul contract <2 x bfloat> %a, %b + %add = fadd contract <2 x bfloat> %mul, %c + store <2 x bfloat> %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) { +; GCN-LABEL: v_test_mul_add_v2bf16_vsc: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_pk_add_bf16 v2, v2, 0.5 op_sel_hi:[1,0] +; GCN-NEXT: global_store_b32 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %mul = fmul contract <2 x bfloat> %a, %b + %add = fadd contract <2 x bfloat> %mul, + store <2 x bfloat> %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) { +; GCN-LABEL: v_test_mul_add_v2bf16_vll: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_pk_add_bf16 v2, 0x43484000, v2 +; GCN-NEXT: global_store_b32 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %mul = fmul contract <2 x bfloat> %a, + %add = fadd contract <2 x bfloat> %mul, + store <2 x bfloat> %add, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { ; GCN-LABEL: v_test_fma_v2bf16_vvv: ; GCN: ; %bb.0: @@ -426,6 +566,8 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src ret void } +declare bfloat @llvm.minnum.bf16(bfloat, bfloat) +declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll index 5b2de59f7d271..84123e6c1e521 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll @@ -121,18 +121,7 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float @@ -150,20 +139,10 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_store_b16 v[0:1], v1, off scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll index 557080a61041c..1b2eb83ba1726 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll @@ -75,15 +75,8 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, b ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float @@ -199,9 +192,8 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bflo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0 -; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> @@ -219,16 +211,13 @@ define <3 x bfloat> @v_mad_mix_v3f32_clamp_postcvt(<3 x bfloat> %src0, <3 x bflo ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX1250-NEXT: v_pk_max_num_bf16 v1, v6, 0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_pk_max_num_bf16 v2, v0, 0 -; GFX1250-NEXT: v_pk_min_num_bf16 v0, v1, 1.0 op_sel_hi:[1,0] +; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-NEXT: v_pk_min_num_bf16 v1, v2, 1.0 +; GFX1250-NEXT: v_mov_b32_e32 v0, v6 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext <3 x bfloat> %src0 to <3 x float> %src1.ext = fpext <3 x bfloat> %src1 to <3 x float> @@ -261,11 +250,8 @@ define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bflo ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0 -; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0] -; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext <4 x bfloat> %src0 to <4 x float> %src1.ext = fpext <4 x bfloat> %src1 to <4 x float> @@ -291,15 +277,7 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x bfloat> %src0, <2 x b ; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v0, v0 clamp ; GFX1250-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> @@ -328,14 +306,8 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x bfloat> %src0, <2 x b ; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX1250-NEXT: s_set_pc_i64 s[30:31]