Skip to content

Commit 8d01bdd

Browse files
changpeng, rampitec
authored and committed
[AMDGPU] Support AMDGPUClamp for bf16 on gfx1250 (llvm#150663)
The scalar version uses V_MAX_BF16_PSEUDO, which is expanded to V_PK_MAX_NUM_BF16 with unused high bits. If V_PK_MAX_NUM_BF16 were produced directly instead, it would create a problem with folding the clamp into other scalar instructions due to incompatible clamp bits. FIXME-TRUE16: enable bf16 clamp with true16. --------- Co-authored-by: Stanislav Mekhanoshin <[email protected]>
1 parent 9b8581b commit 8d01bdd

File tree

8 files changed

+192
-67
lines changed

8 files changed

+192
-67
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14179,6 +14179,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
1417914179
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
1418014180
(VT == MVT::f32 || VT == MVT::f64 ||
1418114181
(VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14182+
(VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14183+
(VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
1418214184
(VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
1418314185
Op0.hasOneUse()) {
1418414186
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2508,7 +2508,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
25082508
.addReg(DstHi);
25092509
}
25102510
break;
2511+
2512+
case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2513+
assert(ST.hasBF16PackedInsts());
2514+
MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2515+
MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2516+
MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2517+
MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2518+
auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2519+
Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2520+
auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2521+
Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2522+
break;
25112523
}
2524+
25122525
return true;
25132526
}
25142527

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2865,6 +2865,7 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
28652865
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
28662866
def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
28672867
def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>;
2868+
def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>;
28682869

28692870
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
28702871
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in
18941894
def : ClampPat<V_MAX_F16_t16_e64, f16>;
18951895
let SubtargetPredicate = UseFakeTrue16Insts in
18961896
def : ClampPat<V_MAX_F16_fake16_e64, f16>;
1897+
// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
1898+
let True16Predicate = UseFakeTrue16Insts in
1899+
def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>;
18971900

18981901
let SubtargetPredicate = HasVOP3PInsts in {
18991902
def : GCNPat <
@@ -1903,6 +1906,13 @@ def : GCNPat <
19031906
>;
19041907
}
19051908

1909+
let SubtargetPredicate = HasBF16PackedInsts in {
1910+
def : GCNPat <
1911+
(v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))),
1912+
(V_PK_MAX_NUM_BF16 $src0_modifiers, $src0,
1913+
$src0_modifiers, $src0, DSTCLAMP.ENABLE)
1914+
>;
1915+
} // End SubtargetPredicate = HasBF16PackedInsts
19061916

19071917
/********** ================================ **********/
19081918
/********** Floating point absolute/negative **********/

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1236,6 +1236,12 @@ let isCommutable = 1, isReMaterializable = 1 in {
12361236
defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>;
12371237
defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>;
12381238
defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>;
1239+
1240+
// Scalar pseudo used to emulate AMDGPUClamp.
1241+
// Expanded to V_PK_MAX_NUM_BF16 with unused high half.
1242+
// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
1243+
let True16Predicate = UseFakeTrue16Insts in
1244+
defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>;
12391245
}
12401246
} // End isCommutable = 1, isReMaterializable = 1
12411247

llvm/test/CodeGen/AMDGPU/bf16-math.ll

Lines changed: 142 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -323,6 +323,146 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
323323
ret void
324324
}
325325

326+
define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) {
327+
; GCN-LABEL: test_clamp_bf16:
328+
; GCN: ; %bb.0:
329+
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
330+
; GCN-NEXT: ; return to shader part epilog
331+
%max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
332+
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
333+
ret bfloat %clamp
334+
}
335+
336+
define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) {
337+
; GCN-LABEL: test_clamp_bf16_s:
338+
; GCN: ; %bb.0:
339+
; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
340+
; GCN-NEXT: ; return to shader part epilog
341+
%max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
342+
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
343+
ret bfloat %clamp
344+
}
345+
346+
define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) {
347+
; GCN-LABEL: test_clamp_v2bf16:
348+
; GCN: ; %bb.0:
349+
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
350+
; GCN-NEXT: ; return to shader part epilog
351+
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
352+
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
353+
%ret = bitcast <2 x bfloat> %clamp to float
354+
ret float %ret
355+
}
356+
357+
define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
358+
; GCN-LABEL: test_clamp_v2bf16_s:
359+
; GCN: ; %bb.0:
360+
; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
361+
; GCN-NEXT: ; return to shader part epilog
362+
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
363+
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
364+
%ret = bitcast <2 x bfloat> %clamp to float
365+
ret float %ret
366+
}
367+
368+
define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
369+
; GCN-LABEL: test_clamp_bf16_folding:
370+
; GCN: ; %bb.0:
371+
; GCN-NEXT: v_exp_bf16_e32 v0, v0
372+
; GCN-NEXT: v_nop
373+
; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
374+
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
375+
; GCN-NEXT: ; return to shader part epilog
376+
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
377+
%max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
378+
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
379+
ret bfloat %clamp
380+
}
381+
382+
define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) {
383+
; GCN-LABEL: test_clamp_v2bf16_folding:
384+
; GCN: ; %bb.0:
385+
; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1
386+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
387+
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
388+
; GCN-NEXT: ; return to shader part epilog
389+
%mul = fmul <2 x bfloat> %src0, %src1
390+
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
391+
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
392+
%ret = bitcast <2 x bfloat> %clamp to float
393+
ret float %ret
394+
}
395+
396+
define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
397+
; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
398+
; GCN: ; %bb.0:
399+
; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3
400+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
401+
; GCN-NEXT: v_pk_add_bf16 v2, v2, v4
402+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
403+
; GCN-NEXT: s_endpgm
404+
%mul = fmul contract <2 x bfloat> %a, %b
405+
%add = fadd contract <2 x bfloat> %mul, %c
406+
store <2 x bfloat> %add, ptr addrspace(1) %out
407+
ret void
408+
}
409+
410+
define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
411+
; GCN-LABEL: v_test_mul_add_v2bf16_vss:
412+
; GCN: ; %bb.0:
413+
; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
414+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
415+
; GCN-NEXT: v_pk_add_bf16 v2, v2, s1
416+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
417+
; GCN-NEXT: s_endpgm
418+
%mul = fmul contract <2 x bfloat> %a, %b
419+
%add = fadd contract <2 x bfloat> %mul, %c
420+
store <2 x bfloat> %add, ptr addrspace(1) %out
421+
ret void
422+
}
423+
424+
define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
425+
; GCN-LABEL: v_test_mul_add_v2bf16_sss:
426+
; GCN: ; %bb.0:
427+
; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1
428+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
429+
; GCN-NEXT: v_pk_add_bf16 v2, v2, s2
430+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
431+
; GCN-NEXT: s_endpgm
432+
%mul = fmul contract <2 x bfloat> %a, %b
433+
%add = fadd contract <2 x bfloat> %mul, %c
434+
store <2 x bfloat> %add, ptr addrspace(1) %out
435+
ret void
436+
}
437+
438+
define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
439+
; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
440+
; GCN: ; %bb.0:
441+
; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
442+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
443+
; GCN-NEXT: v_pk_add_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
444+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
445+
; GCN-NEXT: s_endpgm
446+
%mul = fmul contract <2 x bfloat> %a, %b
447+
%add = fadd contract <2 x bfloat> %mul, <bfloat 0.5, bfloat 0.5>
448+
store <2 x bfloat> %add, ptr addrspace(1) %out
449+
ret void
450+
}
451+
452+
define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
453+
; GCN-LABEL: v_test_mul_add_v2bf16_vll:
454+
; GCN: ; %bb.0:
455+
; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2
456+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
457+
; GCN-NEXT: v_pk_add_bf16 v2, 0x43484000, v2
458+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
459+
; GCN-NEXT: s_endpgm
460+
%mul = fmul contract <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
461+
%add = fadd contract <2 x bfloat> %mul, <bfloat 2.0, bfloat 200.0>
462+
store <2 x bfloat> %add, ptr addrspace(1) %out
463+
ret void
464+
}
465+
326466
define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
327467
; GCN-LABEL: v_test_fma_v2bf16_vvv:
328468
; GCN: ; %bb.0:
@@ -426,6 +566,8 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
426566
ret void
427567
}
428568

569+
declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
570+
declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
429571
declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
430572
declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
431573
declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)

llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll

Lines changed: 4 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -121,18 +121,7 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt
121121
; GFX1250: ; %bb.0:
122122
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
123123
; GFX1250-NEXT: s_wait_kmcnt 0x0
124-
; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
125-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
126-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
127-
; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
128-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
129-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
130-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
131-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
132-
; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
133-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
134-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
135-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
124+
; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
136125
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
137126
%src0.ext = fpext bfloat %src0 to float
138127
%src1.ext = fpext bfloat %src1 to float
@@ -150,20 +139,10 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt
150139
; GFX1250: ; %bb.0:
151140
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
152141
; GFX1250-NEXT: s_wait_kmcnt 0x0
153-
; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v0, v1, v2 op_sel_hi:[1,1,1]
154-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
155-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v1
156-
; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
157-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
158-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
159-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
160-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
161-
; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
162-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
163-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
164-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
142+
; GFX1250-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
143+
; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
165144
; GFX1250-NEXT: s_wait_storecnt 0x0
166-
; GFX1250-NEXT: global_store_b16 v[0:1], v1, off scope:SCOPE_SYS
145+
; GFX1250-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS
167146
; GFX1250-NEXT: s_wait_storecnt 0x0
168147
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
169148
%src0.ext = fpext bfloat %src0 to float

llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll

Lines changed: 14 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -75,15 +75,8 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, b
7575
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
7676
; GFX1250-NEXT: s_wait_kmcnt 0x0
7777
; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
78-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
79-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
80-
; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
81-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
82-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
83-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
84-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
85-
; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
86-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
78+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
79+
; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
8780
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
8881
%src0.ext = fpext bfloat %src0 to float
8982
%src1.ext = fpext bfloat %src1 to float
@@ -199,9 +192,8 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bflo
199192
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
200193
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
201194
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
202-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
203-
; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0
204-
; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0]
195+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
196+
; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
205197
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
206198
%src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
207199
%src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
@@ -219,16 +211,13 @@ define <3 x bfloat> @v_mad_mix_v3f32_clamp_postcvt(<3 x bfloat> %src0, <3 x bflo
219211
; GFX1250: ; %bb.0:
220212
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
221213
; GFX1250-NEXT: s_wait_kmcnt 0x0
222-
; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
223-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
224-
; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
225-
; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v1, v3, v5 op_sel_hi:[1,1,1]
226-
; GFX1250-NEXT: v_pk_max_num_bf16 v1, v6, 0
214+
; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
215+
; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
227216
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
228-
; GFX1250-NEXT: v_pk_max_num_bf16 v2, v0, 0
229-
; GFX1250-NEXT: v_pk_min_num_bf16 v0, v1, 1.0 op_sel_hi:[1,0]
217+
; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
218+
; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
230219
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
231-
; GFX1250-NEXT: v_pk_min_num_bf16 v1, v2, 1.0
220+
; GFX1250-NEXT: v_mov_b32_e32 v0, v6
232221
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
233222
%src0.ext = fpext <3 x bfloat> %src0 to <3 x float>
234223
%src1.ext = fpext <3 x bfloat> %src1 to <3 x float>
@@ -261,11 +250,8 @@ define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bflo
261250
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
262251
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3
263252
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
264-
; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0
265-
; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, 0
266-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
267-
; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0]
268-
; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, 1.0 op_sel_hi:[1,0]
253+
; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
254+
; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
269255
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
270256
%src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
271257
%src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
@@ -291,15 +277,7 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x bfloat> %src0, <2 x b
291277
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
292278
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
293279
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
294-
; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0
295-
; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1
296-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
297-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
298-
; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1
299-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
300-
; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1
301-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
302-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
280+
; GFX1250-NEXT: v_pk_max_num_bf16 v1, v0, v0 clamp
303281
; GFX1250-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
304282
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
305283
%src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
@@ -328,14 +306,8 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x bfloat> %src0, <2 x b
328306
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
329307
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
330308
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
331-
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
332-
; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1
333-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
334-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
335-
; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1
336-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
337-
; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1
338-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
309+
; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0
310+
; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
339311
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
340312
; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
341313
; GFX1250-NEXT: s_set_pc_i64 s[30:31]

0 commit comments

Comments
 (0)