-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Enable kernarg preload on gfx1250 #153686
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Enable kernarg preload on gfx1250 #153686
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) ChangesPatch is 53.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153686.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
index 40094518dce0a..90c4f4e6680c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
@@ -109,7 +109,7 @@ AMDGPUPreloadKernArgProlog::AMDGPUPreloadKernArgProlog(MachineFunction &MF)
TRI(*ST.getRegisterInfo()) {}
bool AMDGPUPreloadKernArgProlog::run() {
- if (!ST.hasKernargPreload())
+ if (!ST.needsKernArgPreloadProlog())
return false;
unsigned NumKernArgPreloadSGPRs = MFI.getNumKernargPreloadedSGPRs();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f47ddf5d93ec3..3d6bfbea7727e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1573,6 +1573,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// extended VA to 57 bits.
bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; }
+ // \returns true if the target needs to create a prolog for backward
+ // compatibility when preloading kernel arguments.
+ bool needsKernArgPreloadProlog() const {
+ return hasKernargPreload() && !GFX1250Insts;
+ }
+
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
index 3a5507063b834..57967bc1650fe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
@@ -16,7 +16,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_v:
@@ -27,7 +27,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, s2
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_v:
@@ -38,7 +38,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_v:
@@ -49,7 +49,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, s2
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 %src) #0
store i16 %cvt, ptr %out, align 2
@@ -58,33 +58,21 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_s:
-; SDAG-REAL16: ; %bb.1:
-; SDAG-REAL16-NEXT: s_load_b32 s8, s[4:5], 0x0
-; SDAG-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-REAL16-NEXT: s_branch .LBB1_0
-; SDAG-REAL16-NEXT: .p2align 8
-; SDAG-REAL16-NEXT: ; %bb.2:
-; SDAG-REAL16-NEXT: .LBB1_0:
+; SDAG-REAL16: ; %bb.0:
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s8
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_s:
-; SDAG-FAKE16: ; %bb.1:
-; SDAG-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x0
-; SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-FAKE16-NEXT: s_branch .LBB1_0
-; SDAG-FAKE16-NEXT: .p2align 8
-; SDAG-FAKE16-NEXT: ; %bb.2:
-; SDAG-FAKE16-NEXT: .LBB1_0:
+; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, s8
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_s:
@@ -95,7 +83,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_s:
@@ -106,7 +94,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, s2
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 %src) #0
store i16 %cvt, ptr %out, align 2
@@ -120,7 +108,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, 0x64
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_i:
@@ -129,7 +117,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, 0x64
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_i:
@@ -138,7 +126,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, 0x64
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_i:
@@ -147,7 +135,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, 0x64
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 100) #0
store i16 %cvt, ptr %out, align 2
@@ -163,7 +151,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_v:
@@ -174,7 +162,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, s2
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_v:
@@ -185,7 +173,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_v:
@@ -196,7 +184,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, s2
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 %src) #0
store i16 %cvt, ptr %out, align 2
@@ -205,33 +193,21 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_s:
-; SDAG-REAL16: ; %bb.1:
-; SDAG-REAL16-NEXT: s_load_b32 s8, s[4:5], 0x0
-; SDAG-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-REAL16-NEXT: s_branch .LBB4_0
-; SDAG-REAL16-NEXT: .p2align 8
-; SDAG-REAL16-NEXT: ; %bb.2:
-; SDAG-REAL16-NEXT: .LBB4_0:
+; SDAG-REAL16: ; %bb.0:
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s8
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_s:
-; SDAG-FAKE16: ; %bb.1:
-; SDAG-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x0
-; SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-FAKE16-NEXT: s_branch .LBB4_0
-; SDAG-FAKE16-NEXT: .p2align 8
-; SDAG-FAKE16-NEXT: ; %bb.2:
-; SDAG-FAKE16-NEXT: .LBB4_0:
+; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, s8
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_s:
@@ -242,7 +218,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_s:
@@ -253,7 +229,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, s2
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 %src) #0
store i16 %cvt, ptr %out, align 2
@@ -267,7 +243,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, 0x64
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_i:
@@ -276,7 +252,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, 0x64
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_i:
@@ -285,7 +261,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, 0x64
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_i:
@@ -294,7 +270,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, 0x64
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 100) #0
store i16 %cvt, ptr %out, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index 79b531e3ce785..c87f723086a41 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 {
; GFX942-LABEL: preload_block_count_x:
@@ -30,6 +31,12 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -65,6 +72,12 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v1, s12
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_unused_arg_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -101,6 +114,14 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[14:15]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: no_free_sgprs_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x28
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[8:9]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -127,6 +148,14 @@ define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 {
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: no_inreg_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -156,6 +185,16 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: mixed_inreg_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b32 s2, s[0:1], 0x10
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -192,6 +231,15 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: incorrect_type_i64_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i64, ptr addrspace(4) %imp_arg_ptr
store i64 %load, ptr addrspace(1) %out
@@ -228,6 +276,14 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: incorrect_type_i16_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] offset:8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i16, ptr addrspace(4) %imp_arg_ptr
store i16 %load, ptr addrspace(1) %out
@@ -261,6 +317,12 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_y:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
%load = load i32, ptr addrspace(4) %gep
@@ -300,6 +362,14 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: random_incorrect_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0xa
+; GFX1250-NEXT...
[truncated]
|
kerbowa
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/10/builds/11454 Here is the relevant piece of the build log for the reference |

No description provided.