Skip to content

Commit 38382ca

Browse files
changpeng and rampitec
committed
[AMDGPU] Support AMDGPUClamp for bf16 on gfx1250
The scalar version uses V_MAX_BF16_PSEUDO, which is expanded to V_PK_MAX_NUM_BF16 with the high bits unused. If V_PK_MAX_NUM_BF16 were produced directly instead, it would create a problem with folding the clamp into other scalar instructions because of incompatible clamp bits. FIXME-TRUE16: enable bf16 clamp with true16. Co-Authored-by: Stanislav Mekhanoshin <[email protected]>
1 parent 2571924 commit 38382ca

File tree

6 files changed

+164
-0
lines changed

6 files changed

+164
-0
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14179,6 +14179,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
1417914179
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
1418014180
(VT == MVT::f32 || VT == MVT::f64 ||
1418114181
(VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14182+
(VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14183+
(VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
1418214184
(VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
1418314185
Op0.hasOneUse()) {
1418414186
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2509,6 +2509,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
25092509
}
25102510
break;
25112511
}
2512+
2513+
case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2514+
assert(ST.hasBF16PackedInsts());
2515+
MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2516+
MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2517+
MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2518+
MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2519+
auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2520+
Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2521+
auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2522+
Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2523+
break;
2524+
}
2525+
25122526
return true;
25132527
}
25142528

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2865,6 +2865,7 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
28652865
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
28662866
def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
28672867
def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>;
2868+
def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>;
28682869

28692870
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
28702871
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in
18941894
def : ClampPat<V_MAX_F16_t16_e64, f16>;
18951895
let SubtargetPredicate = UseFakeTrue16Insts in
18961896
def : ClampPat<V_MAX_F16_fake16_e64, f16>;
1897+
// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
1898+
let True16Predicate = UseFakeTrue16Insts in
1899+
def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>;
18971900

18981901
let SubtargetPredicate = HasVOP3PInsts in {
18991902
def : GCNPat <
@@ -1903,6 +1906,13 @@ def : GCNPat <
19031906
>;
19041907
}
19051908

1909+
let SubtargetPredicate = HasBF16PackedInsts in {
1910+
def : GCNPat <
1911+
(v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))),
1912+
(V_PK_MAX_NUM_BF16 $src0_modifiers, $src0,
1913+
$src0_modifiers, $src0, DSTCLAMP.ENABLE)
1914+
>;
1915+
} // End SubtargetPredicate = HasBF16PackedInsts
19061916

19071917
/********** ================================ **********/
19081918
/********** Floating point absolute/negative **********/

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1236,6 +1236,12 @@ let isCommutable = 1, isReMaterializable = 1 in {
12361236
defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>;
12371237
defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>;
12381238
defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>;
1239+
1240+
// Scalar pseudo used to emulate AMDGPUClamp.
1241+
// Expanded to V_PK_MAX_NUM_BF16 with unused high half.
1242+
// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
1243+
let True16Predicate = UseFakeTrue16Insts in
1244+
defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>;
12391245
}
12401246
} // End isCommutable = 1, isReMaterializable = 1
12411247

llvm/test/CodeGen/AMDGPU/bf16-math.ll

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,135 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
323323
ret void
324324
}
325325

326+
define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) {
327+
; GCN-LABEL: test_clamp_bf16:
328+
; GCN: ; %bb.0:
329+
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
330+
; GCN-NEXT: ; return to shader part epilog
331+
%max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
332+
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
333+
ret bfloat %clamp
334+
}
335+
336+
define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) {
337+
; GCN-LABEL: test_clamp_bf16_s:
338+
; GCN: ; %bb.0:
339+
; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
340+
; GCN-NEXT: ; return to shader part epilog
341+
%max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
342+
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
343+
ret bfloat %clamp
344+
}
345+
346+
define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) {
347+
; GCN-LABEL: test_clamp_v2bf16:
348+
; GCN: ; %bb.0:
349+
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
350+
; GCN-NEXT: ; return to shader part epilog
351+
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
352+
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
353+
%ret = bitcast <2 x bfloat> %clamp to float
354+
ret float %ret
355+
}
356+
357+
define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
358+
; GCN-LABEL: test_clamp_v2bf16_s:
359+
; GCN: ; %bb.0:
360+
; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
361+
; GCN-NEXT: ; return to shader part epilog
362+
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
363+
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
364+
%ret = bitcast <2 x bfloat> %clamp to float
365+
ret float %ret
366+
}
367+
368+
define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
369+
; GCN-LABEL: test_clamp_bf16_folding:
370+
; GCN: ; %bb.0:
371+
; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp
372+
; GCN-NEXT: ; return to shader part epilog
373+
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
374+
%max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
375+
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
376+
ret bfloat %clamp
377+
}
378+
379+
define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) {
380+
; GCN-LABEL: test_clamp_v2bf16_folding:
381+
; GCN: ; %bb.0:
382+
; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 clamp
383+
; GCN-NEXT: ; return to shader part epilog
384+
%mul = fmul <2 x bfloat> %src0, %src1
385+
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
386+
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
387+
%ret = bitcast <2 x bfloat> %clamp to float
388+
ret float %ret
389+
}
390+
391+
define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
392+
; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
393+
; GCN: ; %bb.0:
394+
; GCN-NEXT: v_pk_fma_bf16 v2, v2, v3, v4
395+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
396+
; GCN-NEXT: s_endpgm
397+
%mul = fmul contract <2 x bfloat> %a, %b
398+
%add = fadd contract <2 x bfloat> %mul, %c
399+
store <2 x bfloat> %add, ptr addrspace(1) %out
400+
ret void
401+
}
402+
403+
define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
404+
; GCN-LABEL: v_test_mul_add_v2bf16_vss:
405+
; GCN: ; %bb.0:
406+
; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, s1
407+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
408+
; GCN-NEXT: s_endpgm
409+
%mul = fmul contract <2 x bfloat> %a, %b
410+
%add = fadd contract <2 x bfloat> %mul, %c
411+
store <2 x bfloat> %add, ptr addrspace(1) %out
412+
ret void
413+
}
414+
415+
define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
416+
; GCN-LABEL: v_test_mul_add_v2bf16_sss:
417+
; GCN: ; %bb.0:
418+
; GCN-NEXT: v_mov_b32_e32 v2, s2
419+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
420+
; GCN-NEXT: v_pk_fma_bf16 v2, s0, s1, v2
421+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
422+
; GCN-NEXT: s_endpgm
423+
%mul = fmul contract <2 x bfloat> %a, %b
424+
%add = fadd contract <2 x bfloat> %mul, %c
425+
store <2 x bfloat> %add, ptr addrspace(1) %out
426+
ret void
427+
}
428+
429+
define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
430+
; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
431+
; GCN: ; %bb.0:
432+
; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0]
433+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
434+
; GCN-NEXT: s_endpgm
435+
%mul = fmul contract <2 x bfloat> %a, %b
436+
%add = fadd contract <2 x bfloat> %mul, <bfloat 0.5, bfloat 0.5>
437+
store <2 x bfloat> %add, ptr addrspace(1) %out
438+
ret void
439+
}
440+
441+
define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
442+
; GCN-LABEL: v_test_mul_add_v2bf16_vll:
443+
; GCN: ; %bb.0:
444+
; GCN-NEXT: s_mov_b32 s0, 0x43484000
445+
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
446+
; GCN-NEXT: v_pk_fma_bf16 v2, 0x42c83f80, v2, s0
447+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
448+
; GCN-NEXT: s_endpgm
449+
%mul = fmul contract <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
450+
%add = fadd contract <2 x bfloat> %mul, <bfloat 2.0, bfloat 200.0>
451+
store <2 x bfloat> %add, ptr addrspace(1) %out
452+
ret void
453+
}
454+
326455
define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
327456
; GCN-LABEL: v_test_fma_v2bf16_vvv:
328457
; GCN: ; %bb.0:
@@ -426,6 +555,8 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
426555
ret void
427556
}
428557

558+
declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
559+
declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
429560
declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
430561
declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
431562
declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)

0 commit comments

Comments
 (0)