Skip to content

Conversation

@rampitec
Copy link
Collaborator

No description provided.

@rampitec rampitec requested a review from shiltian August 14, 2025 18:50
@rampitec rampitec marked this pull request as ready for review August 14, 2025 18:50
Copy link
Collaborator Author

This stack of pull requests is managed by Graphite. Learn more about stacking.

@llvmbot
Copy link
Member

llvmbot commented Aug 14, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/153645.diff

9 Files Affected:

  • (modified) llvm/docs/AMDGPUUsage.rst (+2)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+2)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (+4-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUFeatures.td (+1)
  • (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+4-2)
  • (modified) llvm/test/CodeGen/AMDGPU/extra-lds-size.ll (+7)
  • (added) llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll (+13)
  • (added) llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll (+72)
  • (added) llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll (+61)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 5343d66b083c7..8d0786ab0440d 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5598,6 +5598,8 @@ The fields used by CP for code objects before V3 also match those specified in
                                                        roundup(lds-size / (128 * 4))
                                                      GFX950
                                                        roundup(lds-size / (320 * 4))
+                                                     GFX125*
+                                                       roundup(lds-size / (256 * 4))
 
      24      1 bit   ENABLE_EXCEPTION_IEEE_754_FP    Wavefront starts execution
                      _INVALID_OPERATION              with specified exceptions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index f26639847be75..41ab3e6b9dc75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1977,6 +1977,7 @@ def FeatureISAVersion11_5_3 : FeatureSet<
 
 def FeatureISAVersion12 : FeatureSet<
   [FeatureGFX12,
+   FeatureAddressableLocalMemorySize65536,
    FeatureLDSBankCount32,
    FeatureDLInsts,
    FeatureDot7Insts,
@@ -2019,6 +2020,7 @@ def FeatureISAVersion12_50 : FeatureSet<
   [FeatureGFX12,
    FeatureGFX1250Insts,
    FeatureCUStores,
+   FeatureAddressableLocalMemorySize327680,
    FeatureCuMode,
    Feature64BitLiterals,
    FeatureLDSBankCount32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 626734a4752f3..9040a4af9f91e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1103,7 +1103,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.DX10Clamp = Mode.DX10Clamp;
 
   unsigned LDSAlignShift;
-  if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
+  if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) {
+    // LDS is allocated in 256 dword blocks.
+    LDSAlignShift = 10;
+  } else if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
     // LDS is allocated in 320 dword blocks.
     LDSAlignShift = 11;
   } else if (STM.getFeatureBits().test(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index 74d1faeb6f545..d14b5ce80d28e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -30,6 +30,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<
 def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
 def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
 def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;
+def FeatureAddressableLocalMemorySize327680 : SubtargetFeatureAddressableLocalMemorySize<327680>;
 
 class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
   "wavefrontsize"#!shl(1, ValueLog2),
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index e0ac040bdd226..ec9f1abdd8467 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1160,6 +1160,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
     return 65536;
   if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
     return 163840;
+  if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize327680))
+    return 327680;
   return 0;
 }
 
@@ -3340,8 +3342,8 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) {
 }
 
 unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
-  // Currently this is 128 for all subtargets
-  return 128;
+  return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256
+                                                                        : 128;
 }
 
 bool isPackedFP32Inst(unsigned Opc) {
diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
index e1ce5341efdd1..4349b18fd394c 100644
--- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
@@ -6,6 +6,8 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-PAL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-MESA %s
 
 ; Check EXTRA_LDS_SIZE in SPI_SHADER_PGM_RSRC2_PS.
 
@@ -29,6 +31,11 @@
 ; GFX1200-MESA: .long 45100
 ; GFX1200-MESA-NEXT: .long 1024
 
+; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200
+
+; GFX1250-MESA: .long 45100
+; GFX1250-MESA-NEXT: .long 512
+
 @lds = internal addrspace(3) global [4096 x i8] poison
 
 define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
new file mode 100644
index 0000000000000..da92dcdd7104e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
@@ -0,0 +1,13 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; GFX1250 supports upto 320 KB LDS memory.
+; This is a negative test to check when the LDS size exceeds the max usable limit.
+
+; ERROR: error: <unknown>:0:0: local memory (327684) exceeds limit (327680) in function 'test_lds_limit'
+@dst = addrspace(3) global [81921 x i32] undef
+
+define amdgpu_kernel void @test_lds_limit(i32 %val) {
+  %gep = getelementptr [81921 x i32], ptr addrspace(3) @dst, i32 0, i32 100
+  store i32 %val, ptr addrspace(3) %gep
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
new file mode 100644
index 0000000000000..3db0fa8f21759
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=MESA %s
+
+; GFX1250 supports upto 320 KB configurable LDS memory.
+; This test checks the min and max size of LDS that can be allocated.
+
+@lds.i8 = addrspace(3) global i8 undef
+@lds.array.i8 = addrspace(3) global [327679 x i8] undef
+@lds.i16 = addrspace(3) global i16 undef
+@lds.array.i16 = addrspace(3) global [163839 x i16] undef
+@lds.i32 = addrspace(3) global i32 undef
+@lds.array.i32 = addrspace(3) global [81919 x i32] undef
+
+; GCN-LABEL: test_lds_i8:
+; GCN: .amdhsa_group_segment_fixed_size 1
+; GCN: ; LDSByteSize: 1 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i8(i8 %val) {
+  store i8 %val, ptr addrspace(3) @lds.i8
+  ret void
+}
+
+; GCN-LABEL: test_lds_i16:
+; GCN: .amdhsa_group_segment_fixed_size 2
+; GCN: ; LDSByteSize: 2 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i16(i16 %val) {
+  store i16 %val, ptr addrspace(3) @lds.i16
+  ret void
+}
+
+; GCN-LABEL: test_lds_i32:
+; GCN: .amdhsa_group_segment_fixed_size 4
+; GCN: ; LDSByteSize: 4 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i32(i32 %val) {
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
+
+; GCN-LABEL: test_lds_array_i8:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i8() {
+  %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
+  %val = load i8, ptr addrspace(3) %gep
+  store i8 %val, ptr addrspace(3) @lds.i8
+  ret void
+}
+
+; GCN-LABEL: test_lds_array_i16:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i16() {
+  %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
+  %val = load i16, ptr addrspace(3) %gep
+  store i16 %val, ptr addrspace(3) @lds.i16
+  ret void
+}
+
+; GCN-LABEL: test_lds_array_i32:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i32() {
+  %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
+  %val = load i32, ptr addrspace(3) %gep
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
new file mode 100644
index 0000000000000..bfa7d37ce63a7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
@@ -0,0 +1,61 @@
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=PAL %s
+
+; GFX1250 supports upto 320 KB configurable LDS memory.
+; This test checks the min and max size of LDS that can be allocated.
+
+; PAL: .shader_functions:
+; PAL: test_lds_array_i16:
+; PAL: .lds_size:       0x50000
+; PAL: test_lds_array_i32:
+; PAL: .lds_size:       0x50000
+; PAL: test_lds_array_i8:
+; PAL: .lds_size:       0x50000
+; PAL: test_lds_i16:
+; PAL: .lds_size:       0x2
+; PAL: test_lds_i32:
+; PAL: .lds_size:       0x4
+; PAL: test_lds_i8:
+; PAL: .lds_size:       0x1
+
+@lds.i8 = addrspace(3) global i8 undef
+@lds.array.i8 = addrspace(3) global [327679 x i8] undef
+@lds.i16 = addrspace(3) global i16 undef
+@lds.array.i16 = addrspace(3) global [163839 x i16] undef
+@lds.i32 = addrspace(3) global i32 undef
+@lds.array.i32 = addrspace(3) global [81919 x i32] undef
+
+define amdgpu_gfx void @test_lds_i8(i8 %val) {
+  store i8 %val, ptr addrspace(3) @lds.i8
+  ret void
+}
+
+define amdgpu_gfx void @test_lds_i16(i16 %val) {
+  store i16 %val, ptr addrspace(3) @lds.i16
+  ret void
+}
+
+define amdgpu_gfx void @test_lds_i32(i32 %val) {
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i8() {
+  %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
+  %val = load i8, ptr addrspace(3) %gep
+  store i8 %val, ptr addrspace(3) @lds.i8
+  ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i16() {
+  %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
+  %val = load i16, ptr addrspace(3) %gep
+  store i16 %val, ptr addrspace(3) @lds.i16
+  ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i32() {
+  %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
+  %val = load i32, ptr addrspace(3) %gep
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}

@github-actions
Copy link

github-actions bot commented Aug 14, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@rampitec rampitec force-pushed the users/rampitec/08-14-_amdgpu_increase_lds_to_320k_on_gfx1250 branch from 7dab587 to 1509cf7 Compare August 14, 2025 18:59
@rampitec rampitec merged commit 49f2093 into main Aug 14, 2025
10 checks passed
@rampitec rampitec deleted the users/rampitec/08-14-_amdgpu_increase_lds_to_320k_on_gfx1250 branch August 14, 2025 19:52
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants