-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AMDGPU] Increase LDS to 320K on gfx1250 #153645
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
rampitec
merged 1 commit into
main
from
users/rampitec/08-14-_amdgpu_increase_lds_to_320k_on_gfx1250
Aug 14, 2025
Merged
[AMDGPU] Increase LDS to 320K on gfx1250 #153645
rampitec
merged 1 commit into
main
from
users/rampitec/08-14-_amdgpu_increase_lds_to_320k_on_gfx1250
Aug 14, 2025
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Member
|
@llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) ChangesFull diff: https://github.com/llvm/llvm-project/pull/153645.diff 9 Files Affected:
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 5343d66b083c7..8d0786ab0440d 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5598,6 +5598,8 @@ The fields used by CP for code objects before V3 also match those specified in
roundup(lds-size / (128 * 4))
GFX950
roundup(lds-size / (320 * 4))
+ GFX125*
+ roundup(lds-size / (256 * 4))
24 1 bit ENABLE_EXCEPTION_IEEE_754_FP Wavefront starts execution
_INVALID_OPERATION with specified exceptions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index f26639847be75..41ab3e6b9dc75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1977,6 +1977,7 @@ def FeatureISAVersion11_5_3 : FeatureSet<
def FeatureISAVersion12 : FeatureSet<
[FeatureGFX12,
+ FeatureAddressableLocalMemorySize65536,
FeatureLDSBankCount32,
FeatureDLInsts,
FeatureDot7Insts,
@@ -2019,6 +2020,7 @@ def FeatureISAVersion12_50 : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
FeatureCUStores,
+ FeatureAddressableLocalMemorySize327680,
FeatureCuMode,
Feature64BitLiterals,
FeatureLDSBankCount32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 626734a4752f3..9040a4af9f91e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1103,7 +1103,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DX10Clamp = Mode.DX10Clamp;
unsigned LDSAlignShift;
- if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
+ if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) {
+ // LDS is allocated in 256 dword blocks.
+ LDSAlignShift = 10;
+ } else if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
// LDS is allocated in 320 dword blocks.
LDSAlignShift = 11;
} else if (STM.getFeatureBits().test(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index 74d1faeb6f545..d14b5ce80d28e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -30,6 +30,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<
def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;
+def FeatureAddressableLocalMemorySize327680 : SubtargetFeatureAddressableLocalMemorySize<327680>;
class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
"wavefrontsize"#!shl(1, ValueLog2),
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index e0ac040bdd226..ec9f1abdd8467 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1160,6 +1160,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
return 65536;
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
return 163840;
+ if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize327680))
+ return 327680;
return 0;
}
@@ -3340,8 +3342,8 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) {
}
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
- // Currently this is 128 for all subtargets
- return 128;
+ return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256
+ : 128;
}
bool isPackedFP32Inst(unsigned Opc) {
diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
index e1ce5341efdd1..4349b18fd394c 100644
--- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-PAL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-MESA %s
; Check EXTRA_LDS_SIZE in SPI_SHADER_PGM_RSRC2_PS.
@@ -29,6 +31,11 @@
; GFX1200-MESA: .long 45100
; GFX1200-MESA-NEXT: .long 1024
+; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200
+
+; GFX1250-MESA: .long 45100
+; GFX1250-MESA-NEXT: .long 512
+
@lds = internal addrspace(3) global [4096 x i8] poison
define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
new file mode 100644
index 0000000000000..da92dcdd7104e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
@@ -0,0 +1,13 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; GFX1250 supports upto 320 KB LDS memory.
+; This is a negative test to check when the LDS size exceeds the max usable limit.
+
+; ERROR: error: <unknown>:0:0: local memory (327684) exceeds limit (327680) in function 'test_lds_limit'
+@dst = addrspace(3) global [81921 x i32] undef
+
+define amdgpu_kernel void @test_lds_limit(i32 %val) {
+ %gep = getelementptr [81921 x i32], ptr addrspace(3) @dst, i32 0, i32 100
+ store i32 %val, ptr addrspace(3) %gep
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
new file mode 100644
index 0000000000000..3db0fa8f21759
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=MESA %s
+
+; GFX1250 supports upto 320 KB configurable LDS memory.
+; This test checks the min and max size of LDS that can be allocated.
+
+@lds.i8 = addrspace(3) global i8 undef
+@lds.array.i8 = addrspace(3) global [327679 x i8] undef
+@lds.i16 = addrspace(3) global i16 undef
+@lds.array.i16 = addrspace(3) global [163839 x i16] undef
+@lds.i32 = addrspace(3) global i32 undef
+@lds.array.i32 = addrspace(3) global [81919 x i32] undef
+
+; GCN-LABEL: test_lds_i8:
+; GCN: .amdhsa_group_segment_fixed_size 1
+; GCN: ; LDSByteSize: 1 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i8(i8 %val) {
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+; GCN-LABEL: test_lds_i16:
+; GCN: .amdhsa_group_segment_fixed_size 2
+; GCN: ; LDSByteSize: 2 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i16(i16 %val) {
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+; GCN-LABEL: test_lds_i32:
+; GCN: .amdhsa_group_segment_fixed_size 4
+; GCN: ; LDSByteSize: 4 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i32(i32 %val) {
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
+
+; GCN-LABEL: test_lds_array_i8:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i8() {
+ %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
+ %val = load i8, ptr addrspace(3) %gep
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+; GCN-LABEL: test_lds_array_i16:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i16() {
+ %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
+ %val = load i16, ptr addrspace(3) %gep
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+; GCN-LABEL: test_lds_array_i32:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i32() {
+ %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
+ %val = load i32, ptr addrspace(3) %gep
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
new file mode 100644
index 0000000000000..bfa7d37ce63a7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
@@ -0,0 +1,61 @@
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=PAL %s
+
+; GFX1250 supports upto 320 KB configurable LDS memory.
+; This test checks the min and max size of LDS that can be allocated.
+
+; PAL: .shader_functions:
+; PAL: test_lds_array_i16:
+; PAL: .lds_size: 0x50000
+; PAL: test_lds_array_i32:
+; PAL: .lds_size: 0x50000
+; PAL: test_lds_array_i8:
+; PAL: .lds_size: 0x50000
+; PAL: test_lds_i16:
+; PAL: .lds_size: 0x2
+; PAL: test_lds_i32:
+; PAL: .lds_size: 0x4
+; PAL: test_lds_i8:
+; PAL: .lds_size: 0x1
+
+@lds.i8 = addrspace(3) global i8 undef
+@lds.array.i8 = addrspace(3) global [327679 x i8] undef
+@lds.i16 = addrspace(3) global i16 undef
+@lds.array.i16 = addrspace(3) global [163839 x i16] undef
+@lds.i32 = addrspace(3) global i32 undef
+@lds.array.i32 = addrspace(3) global [81919 x i32] undef
+
+define amdgpu_gfx void @test_lds_i8(i8 %val) {
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_i16(i16 %val) {
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_i32(i32 %val) {
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i8() {
+ %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
+ %val = load i8, ptr addrspace(3) %gep
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i16() {
+ %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
+ %val = load i16, ptr addrspace(3) %gep
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i32() {
+ %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
+ %val = load i32, ptr addrspace(3) %gep
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
7dab587 to
1509cf7
Compare
shiltian
approved these changes
Aug 14, 2025
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.

No description provided.