Skip to content

Commit 7d2a58e

Browse files
shiltian and rampitec authored
[AMDGPU] Add support for v_rsq_bf16 on gfx1250 (#149194)
Co-authored-by: Mekhanoshin, Stanislav <[email protected]>
1 parent 038e80c commit 7d2a58e

23 files changed

+881
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts")
671671

672672
TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts")
673673
TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts")
674+
TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts")
674675

675676
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
676677
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
421421
case AMDGPU::BI__builtin_amdgcn_rsq:
422422
case AMDGPU::BI__builtin_amdgcn_rsqf:
423423
case AMDGPU::BI__builtin_amdgcn_rsqh:
424+
case AMDGPU::BI__builtin_amdgcn_rsq_bf16:
424425
return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
425426
case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
426427
case AMDGPU::BI__builtin_amdgcn_rsq_clampf:

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,25 @@ void test_rcp_bf16(global __bf16* out, __bf16 a)
8080
*out = __builtin_amdgcn_rcp_bf16(a);
8181
}
8282

83+
// CHECK-LABEL: @test_rsq_bf16(
84+
// CHECK-NEXT: entry:
85+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
86+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
87+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
88+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
89+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
90+
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
91+
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
92+
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.rsq.bf16(bfloat [[TMP0]])
93+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
94+
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
95+
// CHECK-NEXT: ret void
96+
//
97+
void test_rsq_bf16(global __bf16* out, __bf16 a)
98+
{
99+
*out = __builtin_amdgcn_rsq_bf16(a);
100+
}
101+
83102
// CHECK-LABEL: @test_cvt_f16_fp8(
84103
// CHECK-NEXT: entry:
85104
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,7 @@ let SubtargetPredicate = HasBF16TransInsts in {
531531
defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
532532
defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>;
533533
defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>;
534+
defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>;
534535
}
535536
} // End TRANS = 1, SchedRW = [WriteTrans32]
536537
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
@@ -1141,6 +1142,7 @@ defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>;
11411142
defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>;
11421143
defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>;
11431144
defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>;
1145+
defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>;
11441146

11451147
//===----------------------------------------------------------------------===//
11461148
// GFX10.
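
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll (file path missing from this capture; inferred from the test contents, confirm against the commit)

Lines changed: 95 additions & 0 deletions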
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; xUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=SDAG-REAL16 %s
3+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=SDAG-FAKE16 %s
4+
; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GISEL-REAL16 %s
5+
; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GISEL-FAKE16 %s
6+
7+
; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select.
8+
; FIXME: GlobalISel does not work with bf16
9+
10+
declare bfloat @llvm.amdgcn.rsq.bf16(bfloat) #0
11+
12+
define amdgpu_kernel void @rsq_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
13+
; SDAG-REAL16-LABEL: rsq_bf16:
14+
; SDAG-REAL16: ; %bb.0:
15+
; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
16+
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
17+
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
18+
; SDAG-REAL16-NEXT: v_rsq_bf16_e32 v0.l, s2
19+
; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
20+
; SDAG-REAL16-NEXT: s_endpgm
21+
;
22+
; SDAG-FAKE16-LABEL: rsq_bf16:
23+
; SDAG-FAKE16: ; %bb.0:
24+
; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
25+
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
26+
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
27+
; SDAG-FAKE16-NEXT: v_rsq_bf16_e32 v0, s2
28+
; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
29+
; SDAG-FAKE16-NEXT: s_endpgm
30+
%rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat %src) #0
31+
store bfloat %rsq, ptr addrspace(1) %out, align 2
32+
ret void
33+
}
34+
35+
define amdgpu_kernel void @rsq_bf16_constant_4(ptr addrspace(1) %out) #1 {
36+
; SDAG-REAL16-LABEL: rsq_bf16_constant_4:
37+
; SDAG-REAL16: ; %bb.0:
38+
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
39+
; SDAG-REAL16-NEXT: v_rsq_bf16_e32 v0.l, 4.0
40+
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
41+
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
42+
; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
43+
; SDAG-REAL16-NEXT: s_endpgm
44+
;
45+
; SDAG-FAKE16-LABEL: rsq_bf16_constant_4:
46+
; SDAG-FAKE16: ; %bb.0:
47+
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
48+
; SDAG-FAKE16-NEXT: v_rsq_bf16_e32 v0, 4.0
49+
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
50+
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
51+
; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
52+
; SDAG-FAKE16-NEXT: s_endpgm
53+
%rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat 4.0) #0
54+
store bfloat %rsq, ptr addrspace(1) %out, align 2
55+
ret void
56+
}
57+
58+
define amdgpu_kernel void @rsq_bf16_constant_100(ptr addrspace(1) %out) #1 {
59+
; SDAG-REAL16-LABEL: rsq_bf16_constant_100:
60+
; SDAG-REAL16: ; %bb.0:
61+
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
62+
; SDAG-REAL16-NEXT: v_rsq_bf16_e32 v0.l, 0x42c8
63+
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
64+
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
65+
; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
66+
; SDAG-REAL16-NEXT: s_endpgm
67+
;
68+
; SDAG-FAKE16-LABEL: rsq_bf16_constant_100:
69+
; SDAG-FAKE16: ; %bb.0:
70+
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
71+
; SDAG-FAKE16-NEXT: v_rsq_bf16_e32 v0, 0x42c8
72+
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
73+
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
74+
; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
75+
; SDAG-FAKE16-NEXT: s_endpgm
76+
%rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat 100.0) #0
77+
store bfloat %rsq, ptr addrspace(1) %out, align 2
78+
ret void
79+
}
80+
81+
define amdgpu_kernel void @rsq_undef_bf16(ptr addrspace(1) %out) #1 {
82+
; SDAG-REAL16-LABEL: rsq_undef_bf16:
83+
; SDAG-REAL16: ; %bb.0:
84+
; SDAG-REAL16-NEXT: s_endpgm
85+
;
86+
; SDAG-FAKE16-LABEL: rsq_undef_bf16:
87+
; SDAG-FAKE16: ; %bb.0:
88+
; SDAG-FAKE16-NEXT: s_endpgm
89+
%rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat undef)
90+
store bfloat %rsq, ptr addrspace(1) %out, align 2
91+
ret void
92+
}
93+
94+
attributes #0 = { nounwind readnone }
95+
attributes #1 = { nounwind }

llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,51 @@ v_sqrt_bf16 v5, src_scc
163163
v_sqrt_bf16 v127, 0x8000
164164
// GFX1250: v_sqrt_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf4,0xfe,0x7e,0x00,0x80,0x00,0x00]
165165

166+
v_rsq_bf16 v5, v1
167+
// GFX1250: v_rsq_bf16_e32 v5, v1 ; encoding: [0x01,0xf7,0x0a,0x7e]
168+
169+
v_rsq_bf16 v5, v127
170+
// GFX1250: v_rsq_bf16_e32 v5, v127 ; encoding: [0x7f,0xf7,0x0a,0x7e]
171+
172+
v_rsq_bf16 v5, s1
173+
// GFX1250: v_rsq_bf16_e32 v5, s1 ; encoding: [0x01,0xf6,0x0a,0x7e]
174+
175+
v_rsq_bf16 v5, s105
176+
// GFX1250: v_rsq_bf16_e32 v5, s105 ; encoding: [0x69,0xf6,0x0a,0x7e]
177+
178+
v_rsq_bf16 v5, vcc_lo
179+
// GFX1250: v_rsq_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf6,0x0a,0x7e]
180+
181+
v_rsq_bf16 v5, vcc_hi
182+
// GFX1250: v_rsq_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf6,0x0a,0x7e]
183+
184+
v_rsq_bf16 v5, ttmp15
185+
// GFX1250: v_rsq_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf6,0x0a,0x7e]
186+
187+
v_rsq_bf16 v5, m0
188+
// GFX1250: v_rsq_bf16_e32 v5, m0 ; encoding: [0x7d,0xf6,0x0a,0x7e]
189+
190+
v_rsq_bf16 v5, exec_lo
191+
// GFX1250: v_rsq_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf6,0x0a,0x7e]
192+
193+
v_rsq_bf16 v5, exec_hi
194+
// GFX1250: v_rsq_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf6,0x0a,0x7e]
195+
196+
v_rsq_bf16 v5, null
197+
// GFX1250: v_rsq_bf16_e32 v5, null ; encoding: [0x7c,0xf6,0x0a,0x7e]
198+
199+
v_rsq_bf16 v5, -1
200+
// GFX1250: v_rsq_bf16_e32 v5, -1 ; encoding: [0xc1,0xf6,0x0a,0x7e]
201+
202+
v_rsq_bf16 v5, 0.5
203+
// GFX1250: v_rsq_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf6,0x0a,0x7e]
204+
205+
v_rsq_bf16 v5, src_scc
206+
// GFX1250: v_rsq_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf6,0x0a,0x7e]
207+
208+
v_rsq_bf16 v127, 0x8000
209+
// GFX1250: v_rsq_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00]
210+
166211
v_cvt_f32_bf16 v5, v1
167212
// GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e]
168213

llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,54 @@ v_sqrt_bf16 v127, 0x8000
172172
v_sqrt_bf16 v5.h, v1.h
173173
// GFX1250: v_sqrt_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf5,0x0a,0x7f]
174174

175+
v_rsq_bf16 v5, v1
176+
// GFX1250: v_rsq_bf16_e32 v5, v1 ; encoding: [0x01,0xf7,0x0a,0x7e]
177+
178+
v_rsq_bf16 v5, v127
179+
// GFX1250: v_rsq_bf16_e32 v5, v127 ; encoding: [0x7f,0xf7,0x0a,0x7e]
180+
181+
v_rsq_bf16 v5, s1
182+
// GFX1250: v_rsq_bf16_e32 v5, s1 ; encoding: [0x01,0xf6,0x0a,0x7e]
183+
184+
v_rsq_bf16 v5, s105
185+
// GFX1250: v_rsq_bf16_e32 v5, s105 ; encoding: [0x69,0xf6,0x0a,0x7e]
186+
187+
v_rsq_bf16 v5, vcc_lo
188+
// GFX1250: v_rsq_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf6,0x0a,0x7e]
189+
190+
v_rsq_bf16 v5, vcc_hi
191+
// GFX1250: v_rsq_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf6,0x0a,0x7e]
192+
193+
v_rsq_bf16 v5, ttmp15
194+
// GFX1250: v_rsq_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf6,0x0a,0x7e]
195+
196+
v_rsq_bf16 v5, m0
197+
// GFX1250: v_rsq_bf16_e32 v5, m0 ; encoding: [0x7d,0xf6,0x0a,0x7e]
198+
199+
v_rsq_bf16 v5, exec_lo
200+
// GFX1250: v_rsq_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf6,0x0a,0x7e]
201+
202+
v_rsq_bf16 v5, exec_hi
203+
// GFX1250: v_rsq_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf6,0x0a,0x7e]
204+
205+
v_rsq_bf16 v5, null
206+
// GFX1250: v_rsq_bf16_e32 v5, null ; encoding: [0x7c,0xf6,0x0a,0x7e]
207+
208+
v_rsq_bf16 v5, -1
209+
// GFX1250: v_rsq_bf16_e32 v5, -1 ; encoding: [0xc1,0xf6,0x0a,0x7e]
210+
211+
v_rsq_bf16 v5, 0.5
212+
// GFX1250: v_rsq_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf6,0x0a,0x7e]
213+
214+
v_rsq_bf16 v5, src_scc
215+
// GFX1250: v_rsq_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf6,0x0a,0x7e]
216+
217+
v_rsq_bf16 v127, 0x8000
218+
// GFX1250: v_rsq_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00]
219+
220+
v_rsq_bf16 v5.h, v1.h
221+
// GFX1250: v_rsq_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf7,0x0a,0x7f]
222+
175223
v_cvt_f32_bf16 v5, v1
176224
// GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e]
177225

llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,62 @@ v_sqrt_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 f
170170
// GFX1250: v_sqrt_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
171171
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
172172

173+
v_rsq_bf16 v5, v1 quad_perm:[3,2,1,0]
174+
// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
175+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
176+
177+
v_rsq_bf16 v5, v1 quad_perm:[0,1,2,3]
178+
// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
179+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
180+
181+
v_rsq_bf16 v5, v1 row_mirror
182+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff]
183+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
184+
185+
v_rsq_bf16 v5, v1 row_half_mirror
186+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff]
187+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
188+
189+
v_rsq_bf16 v5, v1 row_shl:1
190+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff]
191+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
192+
193+
v_rsq_bf16 v5, v1 row_shl:15
194+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
195+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
196+
197+
v_rsq_bf16 v5, v1 row_shr:1
198+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff]
199+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
200+
201+
v_rsq_bf16 v5, v1 row_shr:15
202+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
203+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
204+
205+
v_rsq_bf16 v5, v1 row_ror:1
206+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff]
207+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
208+
209+
v_rsq_bf16 v5, v1 row_ror:15
210+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
211+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
212+
213+
v_rsq_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
214+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff]
215+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
216+
217+
v_rsq_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
218+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
219+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
220+
221+
v_rsq_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
222+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13]
223+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
224+
225+
v_rsq_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
226+
// GFX1250: v_rsq_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
227+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
228+
173229
v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0]
174230
// GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
175231
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,66 @@ v_sqrt_bf16 v5.h, v1.h quad_perm:[3,2,1,0]
182182
// GFX1250: v_sqrt_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7f,0x81,0x1b,0x00,0xff]
183183
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
184184

185+
v_rsq_bf16 v5, v1 quad_perm:[3,2,1,0]
186+
// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
187+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
188+
189+
v_rsq_bf16 v5, v1 quad_perm:[0,1,2,3]
190+
// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
191+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
192+
193+
v_rsq_bf16 v5, v1 row_mirror
194+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff]
195+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
196+
197+
v_rsq_bf16 v5, v1 row_half_mirror
198+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff]
199+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
200+
201+
v_rsq_bf16 v5, v1 row_shl:1
202+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff]
203+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
204+
205+
v_rsq_bf16 v5, v1 row_shl:15
206+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
207+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
208+
209+
v_rsq_bf16 v5, v1 row_shr:1
210+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff]
211+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
212+
213+
v_rsq_bf16 v5, v1 row_shr:15
214+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
215+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
216+
217+
v_rsq_bf16 v5, v1 row_ror:1
218+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff]
219+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
220+
221+
v_rsq_bf16 v5, v1 row_ror:15
222+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
223+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
224+
225+
v_rsq_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
226+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff]
227+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
228+
229+
v_rsq_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
230+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
231+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
232+
233+
v_rsq_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
234+
// GFX1250: v_rsq_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13]
235+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
236+
237+
v_rsq_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
238+
// GFX1250: v_rsq_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
239+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
240+
241+
v_rsq_bf16 v5.h, v1.h quad_perm:[3,2,1,0]
242+
// GFX1250: v_rsq_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7f,0x81,0x1b,0x00,0xff]
243+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
244+
185245
v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0]
186246
// GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
187247
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

0 commit comments

Comments
 (0)