Skip to content

Commit ad6d5d2

Browse files
shiltian and rampitec authored
[AMDGPU] Add support for v_log_bf16 on gfx1250 (#149201)
Co-authored-by: Mekhanoshin, Stanislav <[email protected]>
1 parent 7d2a58e commit ad6d5d2

25 files changed

+1090
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -672,6 +672,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts")
672672
TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts")
673673
TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts")
674674
TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts")
675+
TARGET_BUILTIN(__builtin_amdgcn_log_bf16, "yy", "nc", "bf16-trans-insts")
675676

676677
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
677678
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -436,6 +436,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
436436
case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
437437
return EmitAMDGPUDispatchPtr(*this, E);
438438
case AMDGPU::BI__builtin_amdgcn_logf:
439+
case AMDGPU::BI__builtin_amdgcn_log_bf16:
439440
return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
440441
case AMDGPU::BI__builtin_amdgcn_exp2f:
441442
return emitBuiltinWithOneOverloadedType<1>(*this, E,

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -99,6 +99,25 @@ void test_rsq_bf16(global __bf16* out, __bf16 a)
9999
*out = __builtin_amdgcn_rsq_bf16(a);
100100
}
101101

102+
// CHECK-LABEL: @test_log_bf16(
103+
// CHECK-NEXT: entry:
104+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
105+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
106+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
107+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
108+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
109+
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
110+
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
111+
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.log.bf16(bfloat [[TMP0]])
112+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
113+
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
114+
// CHECK-NEXT: ret void
115+
//
116+
void test_log_bf16(global __bf16* out, __bf16 a)
117+
{
118+
*out = __builtin_amdgcn_log_bf16(a);
119+
}
120+
102121
// CHECK-LABEL: @test_cvt_f16_fp8(
103122
// CHECK-NEXT: entry:
104123
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -532,6 +532,7 @@ defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
532532
defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>;
533533
defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>;
534534
defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>;
535+
defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>;
535536
}
536537
} // End TRANS = 1, SchedRW = [WriteTrans32]
537538
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
@@ -1143,6 +1144,7 @@ defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>;
11431144
defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>;
11441145
defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>;
11451146
defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>;
1147+
defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>;
11461148

11471149
//===----------------------------------------------------------------------===//
11481150
// GFX10.

llvm/test/CodeGen/AMDGPU/bf16-math.ll

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,28 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
3+
4+
; TODO: Add global-isel when it can support bf16
5+
6+
define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) {
7+
; GCN-LABEL: llvm_log2_bf16_v:
8+
; GCN: ; %bb.0:
9+
; GCN-NEXT: v_log_bf16_e32 v2, v2
10+
; GCN-NEXT: global_store_b16 v[0:1], v2, off
11+
; GCN-NEXT: s_endpgm
12+
%log = call bfloat @llvm.log2.bf16(bfloat %src)
13+
store bfloat %log, ptr addrspace(1) %out, align 2
14+
ret void
15+
}
16+
17+
define amdgpu_ps void @llvm_log2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) {
18+
; GCN-LABEL: llvm_log2_bf16_s:
19+
; GCN: ; %bb.0:
20+
; GCN-NEXT: v_log_bf16_e32 v2, s0
21+
; GCN-NEXT: global_store_b16 v[0:1], v2, off
22+
; GCN-NEXT: s_endpgm
23+
%log = call bfloat @llvm.log2.bf16(bfloat %src)
24+
store bfloat %log, ptr addrspace(1) %out, align 2
25+
ret void
26+
}
27+
28+
declare bfloat @llvm.log2.bf16(bfloat)
Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,33 @@
1+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s
2+
; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
3+
4+
; FIXME: GlobalISel does not work with bf16
5+
6+
declare bfloat @llvm.amdgcn.log.bf16(bfloat) #0
7+
8+
; GCN-LABEL: {{^}}log_bf16:
9+
; GCN: v_log_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}}
10+
define amdgpu_kernel void @log_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
11+
%log = call bfloat @llvm.amdgcn.log.bf16(bfloat %src) #0
12+
store bfloat %log, ptr addrspace(1) %out, align 2
13+
ret void
14+
}
15+
16+
; GCN-LABEL: {{^}}log_bf16_constant_4
17+
; GCN: v_log_bf16_e32 v0, 4.0
18+
define amdgpu_kernel void @log_bf16_constant_4(ptr addrspace(1) %out) #1 {
19+
%log = call bfloat @llvm.amdgcn.log.bf16(bfloat 4.0) #0
20+
store bfloat %log, ptr addrspace(1) %out, align 2
21+
ret void
22+
}
23+
24+
; GCN-LABEL: {{^}}log_bf16_constant_100
25+
; GCN: v_log_bf16_e32 {{v[0-9]+}}, 0x42c8
26+
define amdgpu_kernel void @log_bf16_constant_100(ptr addrspace(1) %out) #1 {
27+
%log = call bfloat @llvm.amdgcn.log.bf16(bfloat 100.0) #0
28+
store bfloat %log, ptr addrspace(1) %out, align 2
29+
ret void
30+
}
31+
32+
attributes #0 = { nounwind readnone }
33+
attributes #1 = { nounwind }
Lines changed: 240 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,240 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GFX-SDAG-TRUE16 %s
3+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GFX-SDAG-FAKE16 %s
4+
; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GFX-GISEL-TRUE16 %s
5+
; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GFX-GISEL-FAKE16 %s
6+
7+
define bfloat @v_log2_bf16(bfloat %in) {
8+
; GFX-SDAG-TRUE16-LABEL: v_log2_bf16:
9+
; GFX-SDAG-TRUE16: ; %bb.0:
10+
; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
11+
; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
12+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l
13+
; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
14+
;
15+
; GFX-SDAG-FAKE16-LABEL: v_log2_bf16:
16+
; GFX-SDAG-FAKE16: ; %bb.0:
17+
; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
18+
; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
19+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0
20+
; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
21+
%result = call bfloat @llvm.log2.bf16(bfloat %in)
22+
ret bfloat %result
23+
}
24+
25+
define bfloat @v_log2_fabs_bf16(bfloat %in) {
26+
; GFX-SDAG-TRUE16-LABEL: v_log2_fabs_bf16:
27+
; GFX-SDAG-TRUE16: ; %bb.0:
28+
; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
29+
; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
30+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, |v0.l|
31+
; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
32+
;
33+
; GFX-SDAG-FAKE16-LABEL: v_log2_fabs_bf16:
34+
; GFX-SDAG-FAKE16: ; %bb.0:
35+
; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
36+
; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
37+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, |v0|
38+
; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
39+
%fabs = call bfloat @llvm.fabs.bf16(bfloat %in)
40+
%result = call bfloat @llvm.log2.bf16(bfloat %fabs)
41+
ret bfloat %result
42+
}
43+
44+
define bfloat @v_log2_fneg_fabs_bf16(bfloat %in) {
45+
; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_bf16:
46+
; GFX-SDAG-TRUE16: ; %bb.0:
47+
; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
48+
; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
49+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -|v0.l|
50+
; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
51+
;
52+
; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_fabs_bf16:
53+
; GFX-SDAG-FAKE16: ; %bb.0:
54+
; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
55+
; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
56+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -|v0|
57+
; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
58+
%fabs = call bfloat @llvm.fabs.bf16(bfloat %in)
59+
%fneg.fabs = fneg bfloat %fabs
60+
%result = call bfloat @llvm.log2.bf16(bfloat %fneg.fabs)
61+
ret bfloat %result
62+
}
63+
64+
define bfloat @v_log2_fneg_bf16(bfloat %in) {
65+
; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_bf16:
66+
; GFX-SDAG-TRUE16: ; %bb.0:
67+
; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
68+
; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
69+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -v0.l
70+
; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
71+
;
72+
; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_bf16:
73+
; GFX-SDAG-FAKE16: ; %bb.0:
74+
; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
75+
; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
76+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0
77+
; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
78+
%fneg = fneg bfloat %in
79+
%result = call bfloat @llvm.log2.bf16(bfloat %fneg)
80+
ret bfloat %result
81+
}
82+
83+
define bfloat @v_log2_bf16_fast(bfloat %in) {
84+
; GFX-SDAG-TRUE16-LABEL: v_log2_bf16_fast:
85+
; GFX-SDAG-TRUE16: ; %bb.0:
86+
; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
87+
; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
88+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l
89+
; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
90+
;
91+
; GFX-SDAG-FAKE16-LABEL: v_log2_bf16_fast:
92+
; GFX-SDAG-FAKE16: ; %bb.0:
93+
; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
94+
; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
95+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0
96+
; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
97+
%result = call fast bfloat @llvm.log2.bf16(bfloat %in)
98+
ret bfloat %result
99+
}
100+
101+
define <2 x bfloat> @v_log2_v2bf16(<2 x bfloat> %in) {
102+
; GFX-SDAG-TRUE16-LABEL: v_log2_v2bf16:
103+
; GFX-SDAG-TRUE16: ; %bb.0:
104+
; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
105+
; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
106+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v0.h
107+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l
108+
; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
109+
;
110+
; GFX-SDAG-FAKE16-LABEL: v_log2_v2bf16:
111+
; GFX-SDAG-FAKE16: ; %bb.0:
112+
; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
113+
; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
114+
; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
115+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0
116+
; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
117+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1
118+
; GFX-SDAG-FAKE16-NEXT: v_nop
119+
; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
120+
; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
121+
%result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %in)
122+
ret <2 x bfloat> %result
123+
}
124+
125+
define <2 x bfloat> @v_log2_fabs_v2bf16(<2 x bfloat> %in) {
126+
; GFX-SDAG-TRUE16-LABEL: v_log2_fabs_v2bf16:
127+
; GFX-SDAG-TRUE16: ; %bb.0:
128+
; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
129+
; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
130+
; GFX-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
131+
; GFX-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15
132+
; GFX-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
133+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v1.l
134+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v2.l
135+
; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
136+
;
137+
; GFX-SDAG-FAKE16-LABEL: v_log2_fabs_v2bf16:
138+
; GFX-SDAG-FAKE16: ; %bb.0:
139+
; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
140+
; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
141+
; GFX-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
142+
; GFX-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15
143+
; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
144+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1
145+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0
146+
; GFX-SDAG-FAKE16-NEXT: v_nop
147+
; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
148+
; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
149+
; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
150+
%fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in)
151+
%result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fabs)
152+
ret <2 x bfloat> %result
153+
}
154+
155+
define <2 x bfloat> @v_log2_fneg_fabs_v2bf16(<2 x bfloat> %in) {
156+
; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_v2bf16:
157+
; GFX-SDAG-TRUE16: ; %bb.0:
158+
; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
159+
; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
160+
; GFX-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
161+
; GFX-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15
162+
; GFX-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
163+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -v1.l
164+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, -v2.l
165+
; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
166+
;
167+
; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_fabs_v2bf16:
168+
; GFX-SDAG-FAKE16: ; %bb.0:
169+
; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
170+
; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
171+
; GFX-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
172+
; GFX-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15
173+
; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
174+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, -v1
175+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0
176+
; GFX-SDAG-FAKE16-NEXT: v_nop
177+
; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
178+
; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
179+
; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
180+
%fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in)
181+
%fneg.fabs = fneg <2 x bfloat> %fabs
182+
%result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fneg.fabs)
183+
ret <2 x bfloat> %result
184+
}
185+
186+
define <2 x bfloat> @v_log2_fneg_v2bf16(<2 x bfloat> %in) {
187+
; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_v2bf16:
188+
; GFX-SDAG-TRUE16: ; %bb.0:
189+
; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
190+
; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
191+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, -v0.h
192+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -v0.l
193+
; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
194+
;
195+
; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_v2bf16:
196+
; GFX-SDAG-FAKE16: ; %bb.0:
197+
; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
198+
; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
199+
; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
200+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0
201+
; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
202+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, -v1
203+
; GFX-SDAG-FAKE16-NEXT: v_nop
204+
; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
205+
; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
206+
%fneg = fneg <2 x bfloat> %in
207+
%result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fneg)
208+
ret <2 x bfloat> %result
209+
}
210+
211+
define <2 x bfloat> @v_log2_v2bf16_fast(<2 x bfloat> %in) {
212+
; GFX-SDAG-TRUE16-LABEL: v_log2_v2bf16_fast:
213+
; GFX-SDAG-TRUE16: ; %bb.0:
214+
; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
215+
; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
216+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v0.h
217+
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l
218+
; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
219+
;
220+
; GFX-SDAG-FAKE16-LABEL: v_log2_v2bf16_fast:
221+
; GFX-SDAG-FAKE16: ; %bb.0:
222+
; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
223+
; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
224+
; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
225+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0
226+
; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
227+
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1
228+
; GFX-SDAG-FAKE16-NEXT: v_nop
229+
; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
230+
; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
231+
%result = call fast <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %in)
232+
ret <2 x bfloat> %result
233+
}
234+
235+
declare bfloat @llvm.log2.bf16(bfloat) #0
236+
declare <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat>) #0
237+
declare bfloat @llvm.fabs.bf16(bfloat) #0
238+
declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #0
239+
240+
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

0 commit comments

Comments
 (0)