Skip to content

Commit f44daa1

Browse files
committed
[AMDGPU] bf16 clamp folding
1 parent 9f102a9 commit f44daa1

File tree

3 files changed

Lines changed: 22 additions & 21 deletions

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2081,7 +2081,9 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
20812081
case AMDGPU::V_MAX_F16_fake16_e64:
20822082
case AMDGPU::V_MAX_F64_e64:
20832083
case AMDGPU::V_MAX_NUM_F64_e64:
2084-
case AMDGPU::V_PK_MAX_F16: {
2084+
case AMDGPU::V_PK_MAX_F16:
2085+
case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2086+
case AMDGPU::V_PK_MAX_NUM_BF16: {
20852087
if (MI.mayRaiseFPException())
20862088
return nullptr;
20872089

@@ -2108,8 +2110,10 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
21082110

21092111
// Having a 0 op_sel_hi would require swizzling the output in the source
21102112
// instruction, which we can't do.
2111-
unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
2112-
: 0u;
2113+
unsigned UnsetMods =
2114+
(Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
2115+
? SISrcMods::OP_SEL_1
2116+
: 0u;
21132117
if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
21142118
return nullptr;
21152119
return Src0;

llvm/test/CodeGen/AMDGPU/bf16-math.ll

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -368,10 +368,7 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
368368
define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
369369
; GCN-LABEL: test_clamp_bf16_folding:
370370
; GCN: ; %bb.0:
371-
; GCN-NEXT: v_exp_bf16_e32 v0, v0
372-
; GCN-NEXT: v_nop
373-
; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
374-
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
371+
; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp
375372
; GCN-NEXT: ; return to shader part epilog
376373
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
377374
%max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
@@ -382,9 +379,7 @@ define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
382379
define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) {
383380
; GCN-LABEL: test_clamp_v2bf16_folding:
384381
; GCN: ; %bb.0:
385-
; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1
386-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
387-
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
382+
; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 clamp
388383
; GCN-NEXT: ; return to shader part epilog
389384
%mul = fmul <2 x bfloat> %src0, %src1
390385
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)

llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll

Lines changed: 13 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -74,10 +74,11 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, b
7474
; GFX1250: ; %bb.0:
7575
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
7676
; GFX1250-NEXT: s_wait_kmcnt 0x0
77-
; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
78-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
79-
; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
77+
; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
8078
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
79+
80+
81+
8182
%src0.ext = fpext bfloat %src0 to float
8283
%src1.ext = fpext bfloat %src1 to float
8384
%result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
@@ -191,10 +192,11 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bflo
191192
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
192193
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
193194
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
194-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
195-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
196-
; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
195+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 clamp
197196
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
197+
198+
199+
198200
%src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
199201
%src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
200202
%src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
@@ -247,12 +249,12 @@ define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bflo
247249
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[6:7], v[0:1], v[2:3]
248250
; GFX1250-NEXT: v_pk_fma_f32 v[2:3], v[8:9], v[10:11], v[12:13]
249251
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
250-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
251-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3
252-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
253-
; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
254-
; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
252+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 clamp
253+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 clamp
255254
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
255+
256+
257+
256258
%src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
257259
%src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
258260
%src2.ext = fpext <4 x bfloat> %src2 to <4 x float>

0 commit comments

Comments (0)