Skip to content

Commit 23ac7b9

Browse files
shiltian and rampitec authored
[AMDGPU] Add support for v_sqrt_bf16 on gfx1250 (#148921)
Co-authored-by: Mekhanoshin, Stanislav <[email protected]>
1 parent dabc8e2 commit 23ac7b9

22 files changed

+909
-0
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -938,6 +938,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
938938
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
939939
}
940940

941+
if (Subtarget->hasBF16TransInsts()) {
942+
setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
943+
}
944+
941945
if (Subtarget->hasCvtPkF16F32Inst()) {
942946
setOperationAction(ISD::FP_ROUND,
943947
{MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,7 @@ defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
530530
let SubtargetPredicate = HasBF16TransInsts in {
531531
defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
532532
defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>;
533+
defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>;
533534
}
534535
} // End TRANS = 1, SchedRW = [WriteTrans32]
535536
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
@@ -1139,6 +1140,7 @@ defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>;
11391140
defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>;
11401141
defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>;
11411142
defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>;
1143+
defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>;
11421144

11431145
//===----------------------------------------------------------------------===//
11441146
// GFX10.
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s
2+
; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
3+
4+
; FIXME: GlobalISel does not work with bf16
5+
6+
declare bfloat @llvm.amdgcn.sqrt.bf16(bfloat) #0
7+
8+
; GCN-LABEL: {{^}}sqrt_bf16:
9+
; GCN: v_sqrt_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}}
10+
define amdgpu_kernel void @sqrt_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
11+
%sqrt = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat %src) #0
12+
store bfloat %sqrt, ptr addrspace(1) %out, align 2
13+
ret void
14+
}
15+
16+
; GCN-LABEL: {{^}}sqrt_bf16_constant_4
17+
; GCN: v_sqrt_bf16_e32 v0, 4.0
18+
define amdgpu_kernel void @sqrt_bf16_constant_4(ptr addrspace(1) %out) #1 {
19+
%sqrt = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat 4.0) #0
20+
store bfloat %sqrt, ptr addrspace(1) %out, align 2
21+
ret void
22+
}
23+
24+
; GCN-LABEL: {{^}}sqrt_bf16_constant_100
25+
; GCN: v_sqrt_bf16_e32 {{v[0-9]+}}, 0x42c8
26+
define amdgpu_kernel void @sqrt_bf16_constant_100(ptr addrspace(1) %out) #1 {
27+
%sqrt = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat 100.0) #0
28+
store bfloat %sqrt, ptr addrspace(1) %out, align 2
29+
ret void
30+
}
31+
32+
attributes #0 = { nounwind readnone }
33+
attributes #1 = { nounwind }
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s
3+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s
4+
5+
; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select.
6+
7+
declare bfloat @llvm.sqrt.bf16(bfloat %a)
8+
declare <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a)
9+
10+
define amdgpu_kernel void @sqrt_bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
11+
; GFX12-TRUE16-LABEL: sqrt_bf16:
12+
; GFX12-TRUE16: ; %bb.0: ; %entry
13+
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
14+
; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
15+
; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
16+
; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
17+
; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
18+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
19+
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
20+
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
21+
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
22+
; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
23+
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
24+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
25+
; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
26+
; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
27+
; GFX12-TRUE16-NEXT: s_endpgm
28+
;
29+
; GFX12-FAKE16-LABEL: sqrt_bf16:
30+
; GFX12-FAKE16: ; %bb.0: ; %entry
31+
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
32+
; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
33+
; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
34+
; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
35+
; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
36+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
37+
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
38+
; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
39+
; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
40+
; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
41+
; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
42+
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
43+
; GFX12-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
44+
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
45+
; GFX12-FAKE16-NEXT: s_endpgm
46+
entry:
47+
%a.val = load bfloat, ptr addrspace(1) %a
48+
%r.val = call bfloat @llvm.sqrt.bf16(bfloat %a.val)
49+
store bfloat %r.val, ptr addrspace(1) %r
50+
ret void
51+
}
52+
53+
define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
54+
; GFX12-TRUE16-LABEL: sqrt_v2bf16:
55+
; GFX12-TRUE16: ; %bb.0: ; %entry
56+
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
57+
; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
58+
; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
59+
; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
60+
; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
61+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
62+
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
63+
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
64+
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
65+
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
66+
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
67+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
68+
; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v0.l
69+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
70+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
71+
; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
72+
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
73+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
74+
; GFX12-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
75+
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
76+
; GFX12-TRUE16-NEXT: s_endpgm
77+
;
78+
; GFX12-FAKE16-LABEL: sqrt_v2bf16:
79+
; GFX12-FAKE16: ; %bb.0: ; %entry
80+
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
81+
; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
82+
; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
83+
; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
84+
; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
85+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
86+
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
87+
; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
88+
; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
89+
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
90+
; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
91+
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
92+
; GFX12-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0
93+
; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
94+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
95+
; GFX12-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
96+
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
97+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
98+
; GFX12-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
99+
; GFX12-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
100+
; GFX12-FAKE16-NEXT: s_endpgm
101+
entry:
102+
%a.val = load <2 x bfloat>, ptr addrspace(1) %a
103+
%r.val = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a.val)
104+
store <2 x bfloat> %r.val, ptr addrspace(1) %r
105+
ret void
106+
}

llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,51 @@ v_rcp_bf16 v5, src_scc
118118
v_rcp_bf16 v127, 0x8000
119119
// GFX1250: v_rcp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00]
120120

121+
v_sqrt_bf16 v5, v1
122+
// GFX1250: v_sqrt_bf16_e32 v5, v1 ; encoding: [0x01,0xf5,0x0a,0x7e]
123+
124+
v_sqrt_bf16 v5, v127
125+
// GFX1250: v_sqrt_bf16_e32 v5, v127 ; encoding: [0x7f,0xf5,0x0a,0x7e]
126+
127+
v_sqrt_bf16 v5, s1
128+
// GFX1250: v_sqrt_bf16_e32 v5, s1 ; encoding: [0x01,0xf4,0x0a,0x7e]
129+
130+
v_sqrt_bf16 v5, s105
131+
// GFX1250: v_sqrt_bf16_e32 v5, s105 ; encoding: [0x69,0xf4,0x0a,0x7e]
132+
133+
v_sqrt_bf16 v5, vcc_lo
134+
// GFX1250: v_sqrt_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf4,0x0a,0x7e]
135+
136+
v_sqrt_bf16 v5, vcc_hi
137+
// GFX1250: v_sqrt_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf4,0x0a,0x7e]
138+
139+
v_sqrt_bf16 v5, ttmp15
140+
// GFX1250: v_sqrt_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf4,0x0a,0x7e]
141+
142+
v_sqrt_bf16 v5, m0
143+
// GFX1250: v_sqrt_bf16_e32 v5, m0 ; encoding: [0x7d,0xf4,0x0a,0x7e]
144+
145+
v_sqrt_bf16 v5, exec_lo
146+
// GFX1250: v_sqrt_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf4,0x0a,0x7e]
147+
148+
v_sqrt_bf16 v5, exec_hi
149+
// GFX1250: v_sqrt_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf4,0x0a,0x7e]
150+
151+
v_sqrt_bf16 v5, null
152+
// GFX1250: v_sqrt_bf16_e32 v5, null ; encoding: [0x7c,0xf4,0x0a,0x7e]
153+
154+
v_sqrt_bf16 v5, -1
155+
// GFX1250: v_sqrt_bf16_e32 v5, -1 ; encoding: [0xc1,0xf4,0x0a,0x7e]
156+
157+
v_sqrt_bf16 v5, 0.5
158+
// GFX1250: v_sqrt_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf4,0x0a,0x7e]
159+
160+
v_sqrt_bf16 v5, src_scc
161+
// GFX1250: v_sqrt_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf4,0x0a,0x7e]
162+
163+
v_sqrt_bf16 v127, 0x8000
164+
// GFX1250: v_sqrt_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf4,0xfe,0x7e,0x00,0x80,0x00,0x00]
165+
121166
v_cvt_f32_bf16 v5, v1
122167
// GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e]
123168

llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,54 @@ v_rcp_bf16 v127, 0x8000
124124
v_rcp_bf16 v5.h, v1.h
125125
// GFX1250: v_rcp_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf3,0x0a,0x7f]
126126

127+
v_sqrt_bf16 v5, v1
128+
// GFX1250: v_sqrt_bf16_e32 v5, v1 ; encoding: [0x01,0xf5,0x0a,0x7e]
129+
130+
v_sqrt_bf16 v5, v127
131+
// GFX1250: v_sqrt_bf16_e32 v5, v127 ; encoding: [0x7f,0xf5,0x0a,0x7e]
132+
133+
v_sqrt_bf16 v5, s1
134+
// GFX1250: v_sqrt_bf16_e32 v5, s1 ; encoding: [0x01,0xf4,0x0a,0x7e]
135+
136+
v_sqrt_bf16 v5, s105
137+
// GFX1250: v_sqrt_bf16_e32 v5, s105 ; encoding: [0x69,0xf4,0x0a,0x7e]
138+
139+
v_sqrt_bf16 v5, vcc_lo
140+
// GFX1250: v_sqrt_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf4,0x0a,0x7e]
141+
142+
v_sqrt_bf16 v5, vcc_hi
143+
// GFX1250: v_sqrt_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf4,0x0a,0x7e]
144+
145+
v_sqrt_bf16 v5, ttmp15
146+
// GFX1250: v_sqrt_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf4,0x0a,0x7e]
147+
148+
v_sqrt_bf16 v5, m0
149+
// GFX1250: v_sqrt_bf16_e32 v5, m0 ; encoding: [0x7d,0xf4,0x0a,0x7e]
150+
151+
v_sqrt_bf16 v5, exec_lo
152+
// GFX1250: v_sqrt_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf4,0x0a,0x7e]
153+
154+
v_sqrt_bf16 v5, exec_hi
155+
// GFX1250: v_sqrt_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf4,0x0a,0x7e]
156+
157+
v_sqrt_bf16 v5, null
158+
// GFX1250: v_sqrt_bf16_e32 v5, null ; encoding: [0x7c,0xf4,0x0a,0x7e]
159+
160+
v_sqrt_bf16 v5, -1
161+
// GFX1250: v_sqrt_bf16_e32 v5, -1 ; encoding: [0xc1,0xf4,0x0a,0x7e]
162+
163+
v_sqrt_bf16 v5, 0.5
164+
// GFX1250: v_sqrt_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf4,0x0a,0x7e]
165+
166+
v_sqrt_bf16 v5, src_scc
167+
// GFX1250: v_sqrt_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf4,0x0a,0x7e]
168+
169+
v_sqrt_bf16 v127, 0x8000
170+
// GFX1250: v_sqrt_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf4,0xfe,0x7e,0x00,0x80,0x00,0x00]
171+
172+
v_sqrt_bf16 v5.h, v1.h
173+
// GFX1250: v_sqrt_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf5,0x0a,0x7f]
174+
127175
v_cvt_f32_bf16 v5, v1
128176
// GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e]
129177

llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,62 @@ v_rcp_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi
114114
// GFX1250: v_rcp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
115115
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
116116

117+
v_sqrt_bf16 v5, v1 quad_perm:[3,2,1,0]
118+
// GFX1250: v_sqrt_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
119+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
120+
121+
v_sqrt_bf16 v5, v1 quad_perm:[0,1,2,3]
122+
// GFX1250: v_sqrt_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
123+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
124+
125+
v_sqrt_bf16 v5, v1 row_mirror
126+
// GFX1250: v_sqrt_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x40,0x01,0xff]
127+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
128+
129+
v_sqrt_bf16 v5, v1 row_half_mirror
130+
// GFX1250: v_sqrt_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x41,0x01,0xff]
131+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
132+
133+
v_sqrt_bf16 v5, v1 row_shl:1
134+
// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x01,0x01,0xff]
135+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
136+
137+
v_sqrt_bf16 v5, v1 row_shl:15
138+
// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
139+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
140+
141+
v_sqrt_bf16 v5, v1 row_shr:1
142+
// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x11,0x01,0xff]
143+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
144+
145+
v_sqrt_bf16 v5, v1 row_shr:15
146+
// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
147+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
148+
149+
v_sqrt_bf16 v5, v1 row_ror:1
150+
// GFX1250: v_sqrt_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x21,0x01,0xff]
151+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
152+
153+
v_sqrt_bf16 v5, v1 row_ror:15
154+
// GFX1250: v_sqrt_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
155+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
156+
157+
v_sqrt_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
158+
// GFX1250: v_sqrt_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x50,0x01,0xff]
159+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
160+
161+
v_sqrt_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
162+
// GFX1250: v_sqrt_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
163+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
164+
165+
v_sqrt_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
166+
// GFX1250: v_sqrt_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x60,0x09,0x13]
167+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
168+
169+
v_sqrt_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
170+
// GFX1250: v_sqrt_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
171+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
172+
117173
v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0]
118174
// GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
119175
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

0 commit comments

Comments
 (0)