-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AMDGPU] Support AMDGPUClamp for bf16 on gfx1250 #150663
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Scalar version uses V_MAX_BF16_PSEUDO which is expanded to V_PK_MAX_BF16 with unused high bits. If V_PK_MAX_BF16 is produced directly instead that creates problem with folding of the clamp into other scalar instructions due to incompatible clamp bits. FIXME-TRUE16: enable bf16 clamp with true16 Co-Authored-by: Stanislav Mekhanoshin <[email protected]>
|
@llvm/pr-subscribers-backend-amdgpu Author: Changpeng Fang (changpeng) ChangesScalar version uses V_MAX_BF16_PSEUDO which is expanded to V_PK_MAX_BF16 with unused high bits. If V_PK_MAX_BF16 is produced directly instead that creates problem with folding of the clamp into other scalar instructions due to incompatible clamp bits. FIXME-TRUE16: enable bf16 clamp with true16 Full diff: https://github.com/llvm/llvm-project/pull/150663.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0eee7ad0cf923..8d51ec6dc7f31 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14179,6 +14179,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
(VT == MVT::f32 || VT == MVT::f64 ||
(VT == MVT::f16 && Subtarget->has16BitInsts()) ||
+ (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
+ (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
(VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
Op0.hasOneUse()) {
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d04aafba68ed3..cce2c7c48dadf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2509,6 +2509,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
break;
}
+
+ case AMDGPU::V_MAX_BF16_PSEUDO_e64:
+ assert(ST.hasBF16PackedInsts());
+ MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
+ MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
+ MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
+ MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
+ auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
+ Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
+ auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+ Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
+ break;
+ }
+
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index b0be3f864b94d..83b049042f804 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2865,6 +2865,7 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>;
+def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>;
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d05be8f95c618..54fa192aeec92 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in
def : ClampPat<V_MAX_F16_t16_e64, f16>;
let SubtargetPredicate = UseFakeTrue16Insts in
def : ClampPat<V_MAX_F16_fake16_e64, f16>;
+// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
+let True16Predicate = UseFakeTrue16Insts in
+def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>;
let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
@@ -1903,6 +1906,13 @@ def : GCNPat <
>;
}
+let SubtargetPredicate = HasBF16PackedInsts in {
+def : GCNPat <
+ (v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))),
+ (V_PK_MAX_NUM_BF16 $src0_modifiers, $src0,
+ $src0_modifiers, $src0, DSTCLAMP.ENABLE)
+>;
+} // End SubtargetPredicate = HasBF16PackedInsts
/********** ================================ **********/
/********** Floating point absolute/negative **********/
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index c812dc9850514..95fcd4ac1c101 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1236,6 +1236,12 @@ let isCommutable = 1, isReMaterializable = 1 in {
defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>;
defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>;
defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>;
+
+ // Scalar pseudo used to emulate AMDGPUClamp.
+ // Expanded to V_PK_MAX_NUM_BF16 with unused high half.
+ // FIXME-TRUE16: Pseudo expansion of this won't work with True16.
+ let True16Predicate = UseFakeTrue16Insts in
+ defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>;
}
} // End isCommutable = 1, isReMaterializable = 1
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
index 1adf5423356a2..30a78648c186a 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
@@ -323,6 +323,135 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
ret void
}
+define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) {
+; GCN-LABEL: test_clamp_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) {
+; GCN-LABEL: test_clamp_bf16_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) {
+; GCN-LABEL: test_clamp_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ %ret = bitcast <2 x bfloat> %clamp to float
+ ret float %ret
+}
+
+define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
+; GCN-LABEL: test_clamp_v2bf16_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ %ret = bitcast <2 x bfloat> %clamp to float
+ ret float %ret
+}
+
+define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
+; GCN-LABEL: test_clamp_bf16_folding:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %exp = call bfloat @llvm.exp2.bf16(bfloat %src)
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) {
+; GCN-LABEL: test_clamp_v2bf16_folding:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %mul = fmul <2 x bfloat> %src0, %src1
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ %ret = bitcast <2 x bfloat> %clamp to float
+ ret float %ret
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, v3, v4
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, %c
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, %c
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
+; GCN-LABEL: v_test_mul_add_v2bf16_sss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_fma_bf16 v2, s0, s1, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, %c
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, <bfloat 0.5, bfloat 0.5>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vll:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s0, 0x43484000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_pk_fma_bf16 v2, 0x42c83f80, v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
+ %add = fadd contract <2 x bfloat> %mul, <bfloat 2.0, bfloat 200.0>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
; GCN-LABEL: v_test_fma_v2bf16_vvv:
; GCN: ; %bb.0:
@@ -426,6 +555,8 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
ret void
}
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Scalar version uses V_MAX_BF16_PSEUDO which is expanded to V_PK_MAX_BF16 with unused high bits. If V_PK_MAX_BF16 is produced directly instead that creates problem with folding of the clamp into other scalar instructions due to incompatible clamp bits. FIXME-TRUE16: enable bf16 clamp with true16 --------- Co-authored-by: Stanislav Mekhanoshin <[email protected]>
Scalar version uses V_MAX_BF16_PSEUDO which is expanded to V_PK_MAX_BF16 with unused high bits. If V_PK_MAX_BF16 is produced directly instead that creates problem with folding of the clamp into other scalar instructions due to incompatible clamp bits.
FIXME-TRUE16: enable bf16 clamp with true16