Skip to content

Conversation

@rampitec
Copy link
Collaborator

No description provided.

Copy link
Collaborator Author

This stack of pull requests is managed by Graphite. Learn more about stacking.

@rampitec rampitec requested review from kerbowa and shiltian August 14, 2025 21:14
@rampitec rampitec marked this pull request as ready for review August 14, 2025 21:14
@llvmbot
Copy link
Member

llvmbot commented Aug 14, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)

Changes

Patch is 53.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153686.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp (+1-1)
  • (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+6)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll (+28-52)
  • (modified) llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll (+183)
  • (modified) llvm/test/CodeGen/AMDGPU/preload-kernargs.ll (+313-1)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
index 40094518dce0a..90c4f4e6680c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
@@ -109,7 +109,7 @@ AMDGPUPreloadKernArgProlog::AMDGPUPreloadKernArgProlog(MachineFunction &MF)
       TRI(*ST.getRegisterInfo()) {}
 
 bool AMDGPUPreloadKernArgProlog::run() {
-  if (!ST.hasKernargPreload())
+  if (!ST.needsKernArgPreloadProlog())
     return false;
 
   unsigned NumKernArgPreloadSGPRs = MFI.getNumKernargPreloadedSGPRs();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f47ddf5d93ec3..3d6bfbea7727e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1573,6 +1573,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // extended VA to 57 bits.
   bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; }
 
+  // \returns true if the target needs to create a prolog for backward
+  // compatibility when preloading kernel arguments.
+  bool needsKernArgPreloadProlog() const {
+    return hasKernargPreload() && !GFX1250Insts;
+  }
+
   /// \returns SGPR allocation granularity supported by the subtarget.
   unsigned getSGPRAllocGranule() const {
     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
index 3a5507063b834..57967bc1650fe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
@@ -16,7 +16,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-REAL16-NEXT:    s_wait_kmcnt 0x0
 ; SDAG-REAL16-NEXT:    v_sat_pk4_i4_i8_e32 v0.l, s2
-; SDAG-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_v:
@@ -27,7 +27,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; SDAG-FAKE16-NEXT:    v_sat_pk4_i4_i8_e32 v1, s2
-; SDAG-FAKE16-NEXT:    flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT:    flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
 ; SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_v:
@@ -38,7 +38,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
 ; GISEL-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-REAL16-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-REAL16-NEXT:    v_sat_pk4_i4_i8_e32 v0.l, s2
-; GISEL-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; GISEL-REAL16-NEXT:    s_endpgm
 ;
 ; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_v:
@@ -49,7 +49,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
 ; GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-FAKE16-NEXT:    v_sat_pk4_i4_i8_e32 v0, s2
-; GISEL-FAKE16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; GISEL-FAKE16-NEXT:    s_endpgm
   %cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 %src) #0
   store i16 %cvt, ptr %out, align 2
@@ -58,33 +58,21 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
 
 define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
 ; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_s:
-; SDAG-REAL16:       ; %bb.1:
-; SDAG-REAL16-NEXT:    s_load_b32 s8, s[4:5], 0x0
-; SDAG-REAL16-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-REAL16-NEXT:    s_branch .LBB1_0
-; SDAG-REAL16-NEXT:    .p2align 8
-; SDAG-REAL16-NEXT:  ; %bb.2:
-; SDAG-REAL16-NEXT:  .LBB1_0:
+; SDAG-REAL16:       ; %bb.0:
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; SDAG-REAL16-NEXT:    v_sat_pk4_i4_i8_e32 v0.l, s8
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-REAL16-NEXT:    s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_s:
-; SDAG-FAKE16:       ; %bb.1:
-; SDAG-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x0
-; SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-FAKE16-NEXT:    s_branch .LBB1_0
-; SDAG-FAKE16-NEXT:    .p2align 8
-; SDAG-FAKE16-NEXT:  ; %bb.2:
-; SDAG-FAKE16-NEXT:  .LBB1_0:
+; SDAG-FAKE16:       ; %bb.0:
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-FAKE16-NEXT:    v_sat_pk4_i4_i8_e32 v1, s8
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT:    flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT:    flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
 ; SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_s:
@@ -95,7 +83,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
 ; GISEL-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-REAL16-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-REAL16-NEXT:    v_sat_pk4_i4_i8_e32 v0.l, s2
-; GISEL-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; GISEL-REAL16-NEXT:    s_endpgm
 ;
 ; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_s:
@@ -106,7 +94,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
 ; GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-FAKE16-NEXT:    v_sat_pk4_i4_i8_e32 v0, s2
-; GISEL-FAKE16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; GISEL-FAKE16-NEXT:    s_endpgm
   %cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 %src) #0
   store i16 %cvt, ptr %out, align 2
@@ -120,7 +108,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
 ; SDAG-REAL16-NEXT:    v_sat_pk4_i4_i8_e32 v0.l, 0x64
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-REAL16-NEXT:    s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_i:
@@ -129,7 +117,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-FAKE16-NEXT:    v_sat_pk4_i4_i8_e32 v1, 0x64
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT:    flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT:    flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
 ; SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_i:
@@ -138,7 +126,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
 ; GISEL-REAL16-NEXT:    v_sat_pk4_i4_i8_e32 v0.l, 0x64
 ; GISEL-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-REAL16-NEXT:    s_wait_kmcnt 0x0
-; GISEL-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; GISEL-REAL16-NEXT:    s_endpgm
 ;
 ; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_i:
@@ -147,7 +135,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
 ; GISEL-FAKE16-NEXT:    v_sat_pk4_i4_i8_e32 v0, 0x64
 ; GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GISEL-FAKE16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; GISEL-FAKE16-NEXT:    s_endpgm
   %cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 100) #0
   store i16 %cvt, ptr %out, align 2
@@ -163,7 +151,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-REAL16-NEXT:    s_wait_kmcnt 0x0
 ; SDAG-REAL16-NEXT:    v_sat_pk4_u4_u8_e32 v0.l, s2
-; SDAG-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_v:
@@ -174,7 +162,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; SDAG-FAKE16-NEXT:    v_sat_pk4_u4_u8_e32 v1, s2
-; SDAG-FAKE16-NEXT:    flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT:    flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
 ; SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_v:
@@ -185,7 +173,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
 ; GISEL-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-REAL16-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-REAL16-NEXT:    v_sat_pk4_u4_u8_e32 v0.l, s2
-; GISEL-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; GISEL-REAL16-NEXT:    s_endpgm
 ;
 ; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_v:
@@ -196,7 +184,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
 ; GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-FAKE16-NEXT:    v_sat_pk4_u4_u8_e32 v0, s2
-; GISEL-FAKE16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; GISEL-FAKE16-NEXT:    s_endpgm
   %cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 %src) #0
   store i16 %cvt, ptr %out, align 2
@@ -205,33 +193,21 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
 
 define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
 ; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_s:
-; SDAG-REAL16:       ; %bb.1:
-; SDAG-REAL16-NEXT:    s_load_b32 s8, s[4:5], 0x0
-; SDAG-REAL16-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-REAL16-NEXT:    s_branch .LBB4_0
-; SDAG-REAL16-NEXT:    .p2align 8
-; SDAG-REAL16-NEXT:  ; %bb.2:
-; SDAG-REAL16-NEXT:  .LBB4_0:
+; SDAG-REAL16:       ; %bb.0:
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; SDAG-REAL16-NEXT:    v_sat_pk4_u4_u8_e32 v0.l, s8
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-REAL16-NEXT:    s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_s:
-; SDAG-FAKE16:       ; %bb.1:
-; SDAG-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x0
-; SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-FAKE16-NEXT:    s_branch .LBB4_0
-; SDAG-FAKE16-NEXT:    .p2align 8
-; SDAG-FAKE16-NEXT:  ; %bb.2:
-; SDAG-FAKE16-NEXT:  .LBB4_0:
+; SDAG-FAKE16:       ; %bb.0:
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-FAKE16-NEXT:    v_sat_pk4_u4_u8_e32 v1, s8
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT:    flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT:    flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
 ; SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_s:
@@ -242,7 +218,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
 ; GISEL-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-REAL16-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-REAL16-NEXT:    v_sat_pk4_u4_u8_e32 v0.l, s2
-; GISEL-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; GISEL-REAL16-NEXT:    s_endpgm
 ;
 ; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_s:
@@ -253,7 +229,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
 ; GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-FAKE16-NEXT:    v_sat_pk4_u4_u8_e32 v0, s2
-; GISEL-FAKE16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; GISEL-FAKE16-NEXT:    s_endpgm
   %cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 %src) #0
   store i16 %cvt, ptr %out, align 2
@@ -267,7 +243,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
 ; SDAG-REAL16-NEXT:    v_sat_pk4_u4_u8_e32 v0.l, 0x64
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-REAL16-NEXT:    s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_i:
@@ -276,7 +252,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-FAKE16-NEXT:    v_sat_pk4_u4_u8_e32 v1, 0x64
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT:    flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT:    flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
 ; SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_i:
@@ -285,7 +261,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
 ; GISEL-REAL16-NEXT:    v_sat_pk4_u4_u8_e32 v0.l, 0x64
 ; GISEL-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-REAL16-NEXT:    s_wait_kmcnt 0x0
-; GISEL-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; GISEL-REAL16-NEXT:    s_endpgm
 ;
 ; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_i:
@@ -294,7 +270,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
 ; GISEL-FAKE16-NEXT:    v_sat_pk4_u4_u8_e32 v0, 0x64
 ; GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GISEL-FAKE16-NEXT:    flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT:    flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
 ; GISEL-FAKE16-NEXT:    s_endpgm
   %cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 100) #0
   store i16 %cvt, ptr %out, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index 79b531e3ce785..c87f723086a41 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
 
 define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 {
 ; GFX942-LABEL: preload_block_count_x:
@@ -30,6 +31,12 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
 ; GFX90a-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX90a-NEXT:    global_store_dword v0, v1, s[8:9]
 ; GFX90a-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_x:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i32, ptr addrspace(4) %imp_arg_ptr
   store i32 %load, ptr addrspace(1) %out
@@ -65,6 +72,12 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
 ; GFX90a-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX90a-NEXT:    global_store_dword v0, v1, s[8:9]
 ; GFX90a-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: preload_unused_arg_block_count_x:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i32, ptr addrspace(4) %imp_arg_ptr
   store i32 %load, ptr addrspace(1) %out
@@ -101,6 +114,14 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
 ; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90a-NEXT:    global_store_dword v0, v1, s[14:15]
 ; GFX90a-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: no_free_sgprs_block_count_x:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x28
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[8:9]
+; GFX1250-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i32, ptr addrspace(4) %imp_arg_ptr
   store i32 %load, ptr addrspace(1) %out
@@ -127,6 +148,14 @@ define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 {
 ; GFX90a-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX90a-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: no_inreg_block_count_x:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i32, ptr addrspace(4) %imp_arg_ptr
   store i32 %load, ptr addrspace(1) %out
@@ -156,6 +185,16 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32
 ; GFX90a-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX90a-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: mixed_inreg_block_count_x:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b32 s2, s[0:1], 0x10
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i32, ptr addrspace(4) %imp_arg_ptr
   store i32 %load, ptr addrspace(1) %out
@@ -192,6 +231,15 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
 ; GFX90a-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90a-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX90a-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: incorrect_type_i64_block_count_x:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i64, ptr addrspace(4) %imp_arg_ptr
   store i64 %load, ptr addrspace(1) %out
@@ -228,6 +276,14 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
 ; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90a-NEXT:    global_store_short v0, v1, s[8:9]
 ; GFX90a-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: incorrect_type_i16_block_count_x:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    global_load_u16 v1, v0, s[0:1] offset:8
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i16, ptr addrspace(4) %imp_arg_ptr
   store i16 %load, ptr addrspace(1) %out
@@ -261,6 +317,12 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
 ; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX90a-NEXT:    global_store_dword v0, v1, s[8:9]
 ; GFX90a-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_y:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
   %load = load i32, ptr addrspace(4) %gep
@@ -300,6 +362,14 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
 ; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90a-NEXT:    global_store_dword v0, v1, s[8:9]
 ; GFX90a-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: random_incorrect_offset:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b32 s0, s[0:1], 0xa
+; GFX1250-NEXT...
[truncated]

Copy link
Member

@kerbowa kerbowa left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@rampitec rampitec merged commit 8bce10a into main Aug 14, 2025
13 checks passed
@rampitec rampitec deleted the users/rampitec/08-14-_amdgpu_enable_kernarg_preload_on_gfx1250 branch August 14, 2025 23:29
@llvm-ci
Copy link
Collaborator

llvm-ci commented Aug 14, 2025

LLVM Buildbot has detected a new failure on builder openmp-offload-amdgpu-runtime-2 running on rocm-worker-hw-02 while building llvm at step 6 "test-openmp".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/10/builds/11454

Here is the relevant piece of the build log for reference:
Step 6 (test-openmp) failure: test (failure)
******************** TEST 'libarcher :: races/task-two.c' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 13
/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/./bin/clang -fopenmp  -gdwarf-4 -O1 -fsanitize=thread  -I /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/openmp/tools/archer/tests -I /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/runtimes/runtimes-bins/openmp/runtime/src   /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/openmp/tools/archer/tests/races/task-two.c -o /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/runtimes/runtimes-bins/openmp/tools/archer/tests/races/Output/task-two.c.tmp -latomic && env TSAN_OPTIONS='ignore_noninstrumented_modules=0:ignore_noninstrumented_modules=1' /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/openmp/tools/archer/tests/deflake.bash /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/runtimes/runtimes-bins/openmp/tools/archer/tests/races/Output/task-two.c.tmp 2>&1 | tee /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/runtimes/runtimes-bins/openmp/tools/archer/tests/races/Output/task-two.c.tmp.log | /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/./bin/FileCheck /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/openmp/tools/archer/tests/races/task-two.c
# executed command: /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/./bin/clang -fopenmp -gdwarf-4 -O1 -fsanitize=thread -I /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/openmp/tools/archer/tests -I /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/runtimes/runtimes-bins/openmp/runtime/src /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/openmp/tools/archer/tests/races/task-two.c -o /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/runtimes/runtimes-bins/openmp/tools/archer/tests/races/Output/task-two.c.tmp -latomic
# note: command had no output on stdout or stderr
# executed command: env TSAN_OPTIONS=ignore_noninstrumented_modules=0:ignore_noninstrumented_modules=1 /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/openmp/tools/archer/tests/deflake.bash /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/runtimes/runtimes-bins/openmp/tools/archer/tests/races/Output/task-two.c.tmp
# note: command had no output on stdout or stderr
# executed command: tee /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/runtimes/runtimes-bins/openmp/tools/archer/tests/races/Output/task-two.c.tmp.log
# note: command had no output on stdout or stderr
# executed command: /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/./bin/FileCheck /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/openmp/tools/archer/tests/races/task-two.c
# .---command stderr------------
# | /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/openmp/tools/archer/tests/races/task-two.c:44:11: error: CHECK: expected string not found in input
# | // CHECK: ThreadSanitizer: reported {{[0-9]+}} warnings
# |           ^
# | <stdin>:27:5: note: scanning from here
# | DONE
# |     ^
# | <stdin>:28:1: note: possible intended match here
# | ThreadSanitizer: thread T4 finished with ignores enabled, created at:
# | ^
# | 
# | Input file: <stdin>
# | Check file: /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/openmp/tools/archer/tests/races/task-two.c
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             .
# |             .
# |             .
# |            22:  #0 pthread_create /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp:1091:3 (task-two.c.tmp+0xa37da) 
# |            23:  #1 __kmp_create_worker z_Linux_util.cpp (libomp.so+0xcb732) 
# |            24:  
# |            25: SUMMARY: ThreadSanitizer: data race /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/openmp/tools/archer/tests/races/task-two.c:30:10 in .omp_outlined. 
# |            26: ================== 
# |            27: DONE 
# | check:44'0         X error: no match found
# |            28: ThreadSanitizer: thread T4 finished with ignores enabled, created at: 
# | check:44'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | check:44'1     ?                                                                      possible intended match
# |            29:  #0 pthread_create /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp:1091:3 (task-two.c.tmp+0xa37da) 
# | check:44'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            30:  #1 __kmp_create_worker z_Linux_util.cpp (libomp.so+0xcb732) 
# | check:44'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            31:  
...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants