diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 0c653b1b46d65..962c276bc2123 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -2081,7 +2081,9 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const { case AMDGPU::V_MAX_F16_fake16_e64: case AMDGPU::V_MAX_F64_e64: case AMDGPU::V_MAX_NUM_F64_e64: - case AMDGPU::V_PK_MAX_F16: { + case AMDGPU::V_PK_MAX_F16: + case AMDGPU::V_MAX_BF16_PSEUDO_e64: + case AMDGPU::V_PK_MAX_NUM_BF16: { if (MI.mayRaiseFPException()) return nullptr; @@ -2108,8 +2110,10 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const { // Having a 0 op_sel_hi would require swizzling the output in the source // instruction, which we can't do. - unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 - : 0u; + unsigned UnsetMods = + (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16) + ? SISrcMods::OP_SEL_1 + : 0u; if (Src0Mods != UnsetMods && Src1Mods != UnsetMods) return nullptr; return Src0; diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll index 9979e832b799a..682b3b4d57209 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -368,10 +368,7 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) { define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) { ; GCN-LABEL: test_clamp_bf16_folding: ; GCN: ; %bb.0: -; GCN-NEXT: v_exp_bf16_e32 v0, v0 -; GCN-NEXT: v_nop -; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1) -; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp +; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp ; GCN-NEXT: ; return to shader part epilog %exp = call bfloat @llvm.exp2.bf16(bfloat %src) %max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0) @@ -382,9 +379,7 @@ define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) { define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) { ; GCN-LABEL: test_clamp_v2bf16_folding: ; GCN: ; %bb.0: -; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp +; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 clamp ; GCN-NEXT: ; return to shader part epilog %mul = fmul <2 x bfloat> %src0, %src1 %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> ) diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll index 1b2eb83ba1726..439317269b724 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll @@ -74,10 +74,11 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, b ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp ; GFX1250-NEXT: s_set_pc_i64 s[30:31] + + + %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -191,10 +192,11 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bflo ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 clamp ; GFX1250-NEXT: s_set_pc_i64 s[30:31] + + + %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> %src2.ext = fpext <2 x bfloat> %src2 to <2 x float> @@ -247,12 +249,12 @@ define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bflo ; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[6:7], v[0:1], v[2:3] ; GFX1250-NEXT: v_pk_fma_f32 v[2:3], v[8:9], v[10:11], v[12:13] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp -; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 clamp +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 clamp ; GFX1250-NEXT: s_set_pc_i64 s[30:31] + + + %src0.ext = fpext <4 x bfloat> %src0 to <4 x float> %src1.ext = fpext <4 x bfloat> %src1 to <4 x float> %src2.ext = fpext <4 x bfloat> %src2 to <4 x float>