Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14179,6 +14179,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
(VT == MVT::f32 || VT == MVT::f64 ||
(VT == MVT::f16 && Subtarget->has16BitInsts()) ||
(VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
(VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
(VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
Op0.hasOneUse()) {
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
Expand Down
14 changes: 14 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2509,6 +2509,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
break;
}

case AMDGPU::V_MAX_BF16_PSEUDO_e64:
assert(ST.hasBF16PackedInsts());
MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
break;
}

return true;
}

Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2865,6 +2865,7 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>;
def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>;

def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in
def : ClampPat<V_MAX_F16_t16_e64, f16>;
let SubtargetPredicate = UseFakeTrue16Insts in
def : ClampPat<V_MAX_F16_fake16_e64, f16>;
// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
let True16Predicate = UseFakeTrue16Insts in
def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>;

let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
Expand All @@ -1903,6 +1906,13 @@ def : GCNPat <
>;
}

let SubtargetPredicate = HasBF16PackedInsts in {
def : GCNPat <
(v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))),
(V_PK_MAX_NUM_BF16 $src0_modifiers, $src0,
$src0_modifiers, $src0, DSTCLAMP.ENABLE)
>;
} // End SubtargetPredicate = HasBF16PackedInsts

/********** ================================ **********/
/********** Floating point absolute/negative **********/
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1236,6 +1236,12 @@ let isCommutable = 1, isReMaterializable = 1 in {
defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>;
defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>;
defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>;

// Scalar pseudo used to emulate AMDGPUClamp.
// Expanded to V_PK_MAX_NUM_BF16 with unused high half.
// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
let True16Predicate = UseFakeTrue16Insts in
defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>;
}
} // End isCommutable = 1, isReMaterializable = 1

Expand Down
131 changes: 131 additions & 0 deletions llvm/test/CodeGen/AMDGPU/bf16-math.ll
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,135 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
ret void
}

define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) {
; GCN-LABEL: test_clamp_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
; GCN-NEXT: ; return to shader part epilog
%max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
ret bfloat %clamp
}

define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) {
; GCN-LABEL: test_clamp_bf16_s:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
; GCN-NEXT: ; return to shader part epilog
%max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
ret bfloat %clamp
}

define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) {
; GCN-LABEL: test_clamp_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
; GCN-NEXT: ; return to shader part epilog
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
%ret = bitcast <2 x bfloat> %clamp to float
ret float %ret
}

define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
; GCN-LABEL: test_clamp_v2bf16_s:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
; GCN-NEXT: ; return to shader part epilog
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
%ret = bitcast <2 x bfloat> %clamp to float
ret float %ret
}

define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
; GCN-LABEL: test_clamp_bf16_folding:
; GCN: ; %bb.0:
; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp
; GCN-NEXT: ; return to shader part epilog
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
%max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
ret bfloat %clamp
}

define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) {
; GCN-LABEL: test_clamp_v2bf16_folding:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 clamp
; GCN-NEXT: ; return to shader part epilog
%mul = fmul <2 x bfloat> %src0, %src1
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
%ret = bitcast <2 x bfloat> %clamp to float
ret float %ret
}

define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_fma_bf16 v2, v2, v3, v4
; GCN-NEXT: global_store_b32 v[0:1], v2, off
; GCN-NEXT: s_endpgm
%mul = fmul contract <2 x bfloat> %a, %b
%add = fadd contract <2 x bfloat> %mul, %c
store <2 x bfloat> %add, ptr addrspace(1) %out
ret void
}

define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
; GCN-LABEL: v_test_mul_add_v2bf16_vss:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, s1
; GCN-NEXT: global_store_b32 v[0:1], v2, off
; GCN-NEXT: s_endpgm
%mul = fmul contract <2 x bfloat> %a, %b
%add = fadd contract <2 x bfloat> %mul, %c
store <2 x bfloat> %add, ptr addrspace(1) %out
ret void
}

define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
; GCN-LABEL: v_test_mul_add_v2bf16_sss:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_pk_fma_bf16 v2, s0, s1, v2
; GCN-NEXT: global_store_b32 v[0:1], v2, off
; GCN-NEXT: s_endpgm
%mul = fmul contract <2 x bfloat> %a, %b
%add = fadd contract <2 x bfloat> %mul, %c
store <2 x bfloat> %add, ptr addrspace(1) %out
ret void
}

define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0]
; GCN-NEXT: global_store_b32 v[0:1], v2, off
; GCN-NEXT: s_endpgm
%mul = fmul contract <2 x bfloat> %a, %b
%add = fadd contract <2 x bfloat> %mul, <bfloat 0.5, bfloat 0.5>
store <2 x bfloat> %add, ptr addrspace(1) %out
ret void
}

define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
; GCN-LABEL: v_test_mul_add_v2bf16_vll:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s0, 0x43484000
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_pk_fma_bf16 v2, 0x42c83f80, v2, s0
; GCN-NEXT: global_store_b32 v[0:1], v2, off
; GCN-NEXT: s_endpgm
%mul = fmul contract <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
%add = fadd contract <2 x bfloat> %mul, <bfloat 2.0, bfloat 200.0>
store <2 x bfloat> %add, ptr addrspace(1) %out
ret void
}

define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
; GCN-LABEL: v_test_fma_v2bf16_vvv:
; GCN: ; %bb.0:
Expand Down Expand Up @@ -426,6 +555,8 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
ret void
}

declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
Expand Down
Loading