Skip to content

Commit 49f2093

Browse files
authored
[AMDGPU] Increase LDS to 320K on gfx1250 (#153645)
1 parent 334a046 commit 49f2093

File tree

9 files changed

+168
-4
lines changed

9 files changed

+168
-4
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5598,6 +5598,8 @@ The fields used by CP for code objects before V3 also match those specified in
55985598
roundup(lds-size / (128 * 4))
55995599
GFX950
56005600
roundup(lds-size / (320 * 4))
5601+
GFX125*
5602+
roundup(lds-size / (256 * 4))
56015603

56025604
24 1 bit ENABLE_EXCEPTION_IEEE_754_FP Wavefront starts execution
56035605
_INVALID_OPERATION with specified exceptions

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1548,7 +1548,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
15481548

15491549
def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
15501550
"gfx12",
1551-
[FeatureFP64, FeatureAddressableLocalMemorySize65536, FeatureMIMG_R128,
1551+
[FeatureFP64, FeatureMIMG_R128,
15521552
FeatureFlatAddressSpace, Feature16BitInsts,
15531553
FeatureInv2PiInlineImm, FeatureApertureRegs,
15541554
FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts,
@@ -1977,6 +1977,7 @@ def FeatureISAVersion11_5_3 : FeatureSet<
19771977

19781978
def FeatureISAVersion12 : FeatureSet<
19791979
[FeatureGFX12,
1980+
FeatureAddressableLocalMemorySize65536,
19801981
FeatureLDSBankCount32,
19811982
FeatureDLInsts,
19821983
FeatureDot7Insts,
@@ -2019,6 +2020,7 @@ def FeatureISAVersion12_50 : FeatureSet<
20192020
[FeatureGFX12,
20202021
FeatureGFX1250Insts,
20212022
FeatureCUStores,
2023+
FeatureAddressableLocalMemorySize327680,
20222024
FeatureCuMode,
20232025
Feature64BitLiterals,
20242026
FeatureLDSBankCount32,

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1103,7 +1103,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
11031103
ProgInfo.DX10Clamp = Mode.DX10Clamp;
11041104

11051105
unsigned LDSAlignShift;
1106-
if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
1106+
if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) {
1107+
// LDS is allocated in 256 dword blocks.
1108+
LDSAlignShift = 10;
1109+
} else if (STM.getFeatureBits().test(
1110+
FeatureAddressableLocalMemorySize163840)) {
11071111
// LDS is allocated in 320 dword blocks.
11081112
LDSAlignShift = 11;
11091113
} else if (STM.getFeatureBits().test(

llvm/lib/Target/AMDGPU/AMDGPUFeatures.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<
3030
def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
3131
def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
3232
def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;
33+
def FeatureAddressableLocalMemorySize327680 : SubtargetFeatureAddressableLocalMemorySize<327680>;
3334

3435
class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
3536
"wavefrontsize"#!shl(1, ValueLog2),

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1160,6 +1160,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
11601160
return 65536;
11611161
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
11621162
return 163840;
1163+
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize327680))
1164+
return 327680;
11631165
return 0;
11641166
}
11651167

@@ -3340,8 +3342,8 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) {
33403342
}
33413343

33423344
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
3343-
// Currently this is 128 for all subtargets
3344-
return 128;
3345+
return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256
3346+
: 128;
33453347
}
33463348

33473349
bool isPackedFP32Inst(unsigned Opc) {

llvm/test/CodeGen/AMDGPU/extra-lds-size.ll

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s
77
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s
88
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s
9+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-PAL %s
10+
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-MESA %s
911

1012
; Check EXTRA_LDS_SIZE in SPI_SHADER_PGM_RSRC2_PS.
1113

@@ -29,6 +31,11 @@
2931
; GFX1200-MESA: .long 45100
3032
; GFX1200-MESA-NEXT: .long 1024
3133

34+
; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200
35+
36+
; GFX1250-MESA: .long 45100
37+
; GFX1250-MESA-NEXT: .long 512
38+
3239
@lds = internal addrspace(3) global [4096 x i8] poison
3340

3441
define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
2+
3+
; GFX1250 supports up to 320 KB of LDS memory.
4+
; This is a negative test: it checks that an error is reported when the LDS size exceeds the maximum usable limit.
5+
6+
; ERROR: error: <unknown>:0:0: local memory (327684) exceeds limit (327680) in function 'test_lds_limit'
7+
@dst = addrspace(3) global [81921 x i32] undef
8+
9+
define amdgpu_kernel void @test_lds_limit(i32 %val) {
10+
%gep = getelementptr [81921 x i32], ptr addrspace(3) @dst, i32 0, i32 100
11+
store i32 %val, ptr addrspace(3) %gep
12+
ret void
13+
}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
2+
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=MESA %s
3+
4+
; GFX1250 supports up to 320 KB of configurable LDS memory.
5+
; This test checks the min and max size of LDS that can be allocated.
6+
7+
@lds.i8 = addrspace(3) global i8 undef
8+
@lds.array.i8 = addrspace(3) global [327679 x i8] undef
9+
@lds.i16 = addrspace(3) global i16 undef
10+
@lds.array.i16 = addrspace(3) global [163839 x i16] undef
11+
@lds.i32 = addrspace(3) global i32 undef
12+
@lds.array.i32 = addrspace(3) global [81919 x i32] undef
13+
14+
; GCN-LABEL: test_lds_i8:
15+
; GCN: .amdhsa_group_segment_fixed_size 1
16+
; GCN: ; LDSByteSize: 1 bytes/workgroup
17+
; MESA: granulated_lds_size = 1
18+
define amdgpu_kernel void @test_lds_i8(i8 %val) {
19+
store i8 %val, ptr addrspace(3) @lds.i8
20+
ret void
21+
}
22+
23+
; GCN-LABEL: test_lds_i16:
24+
; GCN: .amdhsa_group_segment_fixed_size 2
25+
; GCN: ; LDSByteSize: 2 bytes/workgroup
26+
; MESA: granulated_lds_size = 1
27+
define amdgpu_kernel void @test_lds_i16(i16 %val) {
28+
store i16 %val, ptr addrspace(3) @lds.i16
29+
ret void
30+
}
31+
32+
; GCN-LABEL: test_lds_i32:
33+
; GCN: .amdhsa_group_segment_fixed_size 4
34+
; GCN: ; LDSByteSize: 4 bytes/workgroup
35+
; MESA: granulated_lds_size = 1
36+
define amdgpu_kernel void @test_lds_i32(i32 %val) {
37+
store i32 %val, ptr addrspace(3) @lds.i32
38+
ret void
39+
}
40+
41+
; GCN-LABEL: test_lds_array_i8:
42+
; GCN: .amdhsa_group_segment_fixed_size 327680
43+
; GCN: ; LDSByteSize: 327680 bytes/workgroup
44+
; MESA: granulated_lds_size = 320
45+
define amdgpu_kernel void @test_lds_array_i8() {
46+
%gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
47+
%val = load i8, ptr addrspace(3) %gep
48+
store i8 %val, ptr addrspace(3) @lds.i8
49+
ret void
50+
}
51+
52+
; GCN-LABEL: test_lds_array_i16:
53+
; GCN: .amdhsa_group_segment_fixed_size 327680
54+
; GCN: ; LDSByteSize: 327680 bytes/workgroup
55+
; MESA: granulated_lds_size = 320
56+
define amdgpu_kernel void @test_lds_array_i16() {
57+
%gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
58+
%val = load i16, ptr addrspace(3) %gep
59+
store i16 %val, ptr addrspace(3) @lds.i16
60+
ret void
61+
}
62+
63+
; GCN-LABEL: test_lds_array_i32:
64+
; GCN: .amdhsa_group_segment_fixed_size 327680
65+
; GCN: ; LDSByteSize: 327680 bytes/workgroup
66+
; MESA: granulated_lds_size = 320
67+
define amdgpu_kernel void @test_lds_array_i32() {
68+
%gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
69+
%val = load i32, ptr addrspace(3) %gep
70+
store i32 %val, ptr addrspace(3) @lds.i32
71+
ret void
72+
}
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=PAL %s
2+
3+
; GFX1250 supports up to 320 KB of configurable LDS memory.
4+
; This test checks the min and max size of LDS that can be allocated.
5+
6+
; PAL: .shader_functions:
7+
; PAL: test_lds_array_i16:
8+
; PAL: .lds_size: 0x50000
9+
; PAL: test_lds_array_i32:
10+
; PAL: .lds_size: 0x50000
11+
; PAL: test_lds_array_i8:
12+
; PAL: .lds_size: 0x50000
13+
; PAL: test_lds_i16:
14+
; PAL: .lds_size: 0x2
15+
; PAL: test_lds_i32:
16+
; PAL: .lds_size: 0x4
17+
; PAL: test_lds_i8:
18+
; PAL: .lds_size: 0x1
19+
20+
@lds.i8 = addrspace(3) global i8 undef
21+
@lds.array.i8 = addrspace(3) global [327679 x i8] undef
22+
@lds.i16 = addrspace(3) global i16 undef
23+
@lds.array.i16 = addrspace(3) global [163839 x i16] undef
24+
@lds.i32 = addrspace(3) global i32 undef
25+
@lds.array.i32 = addrspace(3) global [81919 x i32] undef
26+
27+
define amdgpu_gfx void @test_lds_i8(i8 %val) {
28+
store i8 %val, ptr addrspace(3) @lds.i8
29+
ret void
30+
}
31+
32+
define amdgpu_gfx void @test_lds_i16(i16 %val) {
33+
store i16 %val, ptr addrspace(3) @lds.i16
34+
ret void
35+
}
36+
37+
define amdgpu_gfx void @test_lds_i32(i32 %val) {
38+
store i32 %val, ptr addrspace(3) @lds.i32
39+
ret void
40+
}
41+
42+
define amdgpu_gfx void @test_lds_array_i8() {
43+
%gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
44+
%val = load i8, ptr addrspace(3) %gep
45+
store i8 %val, ptr addrspace(3) @lds.i8
46+
ret void
47+
}
48+
49+
define amdgpu_gfx void @test_lds_array_i16() {
50+
%gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
51+
%val = load i16, ptr addrspace(3) %gep
52+
store i16 %val, ptr addrspace(3) @lds.i16
53+
ret void
54+
}
55+
56+
define amdgpu_gfx void @test_lds_array_i32() {
57+
%gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
58+
%val = load i32, ptr addrspace(3) %gep
59+
store i32 %val, ptr addrspace(3) @lds.i32
60+
ret void
61+
}

0 commit comments

Comments
 (0)