Skip to content

Commit d7ec80c

Browse files
shiltian and rampitec authored
[AMDGPU] Add support for v_tanh_bf16 on gfx1250 (#147425)
Co-authored-by: Mekhanoshin, Stanislav <[email protected]>
1 parent 5a4586f commit d7ec80c

33 files changed, +922 -4 lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -668,6 +668,8 @@ TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
668668
TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts")
669669
TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts")
670670

671+
TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts")
672+
671673
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
672674
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
673675
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -497,6 +497,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
497497
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
498498
return Builder.CreateCall(F, { Src });
499499
}
500+
case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
501+
return emitBuiltinWithOneOverloadedType<1>(*this, E,
502+
Intrinsic::amdgcn_tanh);
500503
case AMDGPU::BI__builtin_amdgcn_uicmp:
501504
case AMDGPU::BI__builtin_amdgcn_uicmpl:
502505
case AMDGPU::BI__builtin_amdgcn_sicmp:

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@
108108
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
109109
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
110110
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
111-
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+transpose-load-f4f6-insts,+wavefrontsize32"
111+
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+transpose-load-f4f6-insts,+wavefrontsize32"
112112

113113
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
114114

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,25 @@ void test_s_wait_tensorcnt() {
4242
__builtin_amdgcn_s_wait_tensorcnt(0);
4343
}
4444

45+
// CHECK-LABEL: @test_tanh_bf16(
46+
// CHECK-NEXT: entry:
47+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
48+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
49+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
50+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
51+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
52+
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
53+
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
54+
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.tanh.bf16(bfloat [[TMP0]])
55+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
56+
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
57+
// CHECK-NEXT: ret void
58+
//
59+
void test_tanh_bf16(global __bf16* out, __bf16 a)
60+
{
61+
*out = __builtin_amdgcn_tanh_bf16(a);
62+
}
63+
4564
// CHECK-LABEL: @test_cvt_f16_fp8(
4665
// CHECK-NEXT: entry:
4766
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,10 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
588588
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
589589
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
590590

591+
def int_amdgcn_tanh : DefaultAttrsIntrinsic<
592+
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
593+
>;
594+
591595
def int_amdgcn_cvt_pk_f16_fp8 : DefaultAttrsIntrinsic<
592596
[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrSpeculatable]
593597
>, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_fp8">;

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
541541
"Use true 16-bit registers"
542542
>;
543543

544+
def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
545+
"HasBF16TransInsts",
546+
"true",
547+
"Has bf16 transcendental instructions"
548+
>;
549+
544550
def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts",
545551
"HasBF16ConversionInsts",
546552
"true",
@@ -1967,6 +1973,7 @@ def FeatureISAVersion12_50 : FeatureSet<
19671973
FeatureDPPSrc1SGPR,
19681974
FeatureBitOp3Insts,
19691975
FeatureTransposeLoadF4F6Insts,
1976+
FeatureBF16TransInsts,
19701977
FeatureBF16ConversionInsts,
19711978
FeatureCvtPkF16F32Inst,
19721979
FeatureMinimum3Maximum3PKF16,
@@ -2442,6 +2449,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
24422449
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
24432450
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
24442451

2452+
def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
2453+
AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
2454+
24452455
def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">,
24462456
AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>;
24472457

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4006,7 +4006,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
40064006
case Intrinsic::amdgcn_rsq:
40074007
case Intrinsic::amdgcn_rcp_legacy:
40084008
case Intrinsic::amdgcn_rsq_legacy:
4009-
case Intrinsic::amdgcn_rsq_clamp: {
4009+
case Intrinsic::amdgcn_rsq_clamp:
4010+
case Intrinsic::amdgcn_tanh: {
40104011
// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
40114012
SDValue Src = N->getOperand(1);
40124013
return Src.isUndef() ? Src : SDValue();
@@ -6196,7 +6197,8 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
61966197
case Intrinsic::amdgcn_rsq:
61976198
case Intrinsic::amdgcn_rcp_legacy:
61986199
case Intrinsic::amdgcn_rsq_legacy:
6199-
case Intrinsic::amdgcn_rsq_clamp: {
6200+
case Intrinsic::amdgcn_rsq_clamp:
6201+
case Intrinsic::amdgcn_tanh: {
62006202
if (SNaN)
62016203
return true;
62026204

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -700,7 +700,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
700700
break;
701701
}
702702
case Intrinsic::amdgcn_sqrt:
703-
case Intrinsic::amdgcn_rsq: {
703+
case Intrinsic::amdgcn_rsq:
704+
case Intrinsic::amdgcn_tanh: {
704705
Value *Src = II.getArgOperand(0);
705706
if (isa<PoisonValue>(Src))
706707
return IC.replaceInstUsesWith(II, Src);

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4546,6 +4546,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45464546
case Intrinsic::amdgcn_rcp_legacy:
45474547
case Intrinsic::amdgcn_rsq_legacy:
45484548
case Intrinsic::amdgcn_rsq_clamp:
4549+
case Intrinsic::amdgcn_tanh:
45494550
case Intrinsic::amdgcn_fmul_legacy:
45504551
case Intrinsic::amdgcn_fma_legacy:
45514552
case Intrinsic::amdgcn_frexp_mant:

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ class AMDGPUSubtarget {
5959
bool HasCvtPkF16F32Inst = false;
6060
bool HasF32ToF16BF16ConversionSRInsts = false;
6161
bool EnableRealTrue16Insts = false;
62+
bool HasBF16TransInsts = false;
6263
bool HasBF16ConversionInsts = false;
6364
bool HasMadMixInsts = false;
6465
bool HasMadMacF32Insts = false;
@@ -202,6 +203,8 @@ class AMDGPUSubtarget {
202203
// supported and the support for fake True16 instructions is removed.
203204
bool useRealTrue16Insts() const;
204205

206+
bool hasBF16TransInsts() const { return HasBF16TransInsts; }
207+
205208
bool hasBF16ConversionInsts() const {
206209
return HasBF16ConversionInsts;
207210
}

0 commit comments

Comments
 (0)