Skip to content

Commit 0ebd433

Browse files
[AMDGPU] Be less optimistic when allocating module scope lds (#161464)
Make the test for when additional variables can be added to the struct allocated at address zero more stringent. Previously, variables could be added to it (for faster access) even when doing so increased the LDS requested by a kernel. This corrects that oversight. The test case diff shows the change from all variables being allocated into the module LDS to only some being allocated there; in particular, it shows the introduction of uses of the offset table and that some kernels now use less LDS than before. Alternative to PR 160181.
1 parent a2b6602 commit 0ebd433

File tree

2 files changed

+46
-48
lines changed

2 files changed

+46
-48
lines changed

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,8 @@ class AMDGPULowerModuleLDS {
608608
? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
609609
: EmptySet;
610610

611+
const size_t HybridModuleRootKernelsSize = HybridModuleRootKernels.size();
612+
611613
for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
612614
// Each iteration of this loop assigns exactly one global variable to
613615
// exactly one of the implementation strategies.
@@ -647,7 +649,8 @@ class AMDGPULowerModuleLDS {
647649
ModuleScopeVariables.insert(GV);
648650
} else if (K.second.size() == 1) {
649651
KernelAccessVariables.insert(GV);
650-
} else if (set_is_subset(K.second, HybridModuleRootKernels)) {
652+
} else if (K.second.size() == HybridModuleRootKernelsSize &&
653+
set_is_subset(K.second, HybridModuleRootKernels)) {
651654
ModuleScopeVariables.insert(GV);
652655
} else {
653656
TableLookupVariables.insert(GV);
Lines changed: 42 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
2-
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
2+
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
33

44
; Regression test for issue 160181
55
; One variable is chosen to be assigned at zero. Here, that's @both
@@ -22,12 +22,20 @@
2222
;.
2323
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META0:![0-9]+]]
2424
; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
25+
; CHECK: @llvm.amdgcn.kernel.kern_one.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_one.lds.t poison, align 4, !absolute_symbol [[META1:![0-9]+]]
26+
; CHECK: @llvm.amdgcn.kernel.kern_two.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_two.lds.t poison, align 4, !absolute_symbol [[META1]]
27+
; CHECK: @llvm.amdgcn.kernel.kern_block_direct_allocation.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_block_direct_allocation.lds.t poison, align 4, !absolute_symbol [[META1]]
28+
2529
;.
2630
define void @func_one() {
2731
; CHECK-LABEL: define {{[^@]+}}@func_one() {
28-
; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1:![0-9]+]]
29-
; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META18:![0-9]+]]
30-
; CHECK-NEXT: store i16 10, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23:![0-9]+]]
32+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
33+
; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2:![0-9]+]]
34+
; CHECK-NEXT: [[ONE:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
35+
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ONE]], align 4
36+
; CHECK-NEXT: [[ONE1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
37+
; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) [[ONE1]], align 4
38+
; CHECK-NEXT: store i16 10, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11:![0-9]+]]
3139
; CHECK-NEXT: ret void
3240
;
3341
%val0 = load i32, ptr addrspace(3) @both
@@ -38,9 +46,10 @@ define void @func_one() {
3846

3947
define amdgpu_kernel void @kern_one() {
4048
; CHECK-LABEL: define {{[^@]+}}@kern_one
41-
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
49+
; CHECK-SAME: () #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META16:![0-9]+]] {
4250
; CHECK-NEXT: entry:
43-
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !noalias [[META24:![0-9]+]]
51+
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_one.lds) ]
52+
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !noalias [[META17:![0-9]+]]
4453
; CHECK-NEXT: call void @func_one()
4554
; CHECK-NEXT: ret void
4655
;
@@ -51,9 +60,13 @@ entry:
5160

5261
define void @func_two() {
5362
; CHECK-LABEL: define {{[^@]+}}@func_two() {
54-
; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1]]
55-
; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 2), align 4, !noalias [[META25:![0-9]+]]
56-
; CHECK-NEXT: store i16 20, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23]]
63+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
64+
; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2]]
65+
; CHECK-NEXT: [[TWO:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
66+
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TWO]], align 4
67+
; CHECK-NEXT: [[TWO1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
68+
; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) [[TWO1]], align 4
69+
; CHECK-NEXT: store i16 20, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11]]
5770
; CHECK-NEXT: ret void
5871
;
5972
%val0 = load i32, ptr addrspace(3) @both
@@ -64,9 +77,10 @@ define void @func_two() {
6477

6578
define amdgpu_kernel void @kern_two() {
6679
; CHECK-LABEL: define {{[^@]+}}@kern_two
67-
; CHECK-SAME: () #[[ATTR0]] {
80+
; CHECK-SAME: () #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META18:![0-9]+]] {
6881
; CHECK-NEXT: entry:
69-
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META26:![0-9]+]], !noalias [[META27:![0-9]+]]
82+
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_two.lds) ]
83+
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]]
7084
; CHECK-NEXT: call void @func_two()
7185
; CHECK-NEXT: ret void
7286
;
@@ -82,11 +96,18 @@ entry:
8296
; remains the best candidate for address zero allocation.
8397
define void @func_block_direct_allocation() {
8498
; CHECK-LABEL: define {{[^@]+}}@func_block_direct_allocation() {
85-
; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META18]]
86-
; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 2), align 4, !noalias [[META25]]
99+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
100+
; CHECK-NEXT: [[ONE:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
101+
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ONE]], align 4
102+
; CHECK-NEXT: [[ONE1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
103+
; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(3) [[ONE1]], align 4
104+
; CHECK-NEXT: [[TWO:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
105+
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TWO]], align 4
106+
; CHECK-NEXT: [[TWO2:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
107+
; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(3) [[TWO2]], align 4
87108
; CHECK-NEXT: [[SUM:%.*]] = add i32 [[VAL1]], [[VAL2]]
88-
; CHECK-NEXT: store i32 [[SUM]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1]]
89-
; CHECK-NEXT: store i16 30, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23]]
109+
; CHECK-NEXT: store i32 [[SUM]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2]]
110+
; CHECK-NEXT: store i16 30, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11]]
90111
; CHECK-NEXT: ret void
91112
;
92113
%val1 = load i32, ptr addrspace(3) @one
@@ -99,7 +120,8 @@ define void @func_block_direct_allocation() {
99120

100121
define amdgpu_kernel void @kern_block_direct_allocation() {
101122
; CHECK-LABEL: define {{[^@]+}}@kern_block_direct_allocation
102-
; CHECK-SAME: () #[[ATTR0]] {
123+
; CHECK-SAME: () #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META21:![0-9]+]] {
124+
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_block_direct_allocation.lds) ], !alias.scope [[META22:![0-9]+]], !noalias [[META25:![0-9]+]]
103125
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
104126
; CHECK-NEXT: call void @func_block_direct_allocation()
105127
; CHECK-NEXT: call void @func_one()
@@ -112,35 +134,8 @@ define amdgpu_kernel void @kern_block_direct_allocation() {
112134
ret void
113135
}
114136
;.
115-
; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="16" }
116-
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
117-
;.
118-
; CHECK: [[META0]] = !{i32 0, i32 1}
119-
; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META4:![0-9]+]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]], [[META10:![0-9]+]], [[META12:![0-9]+]], [[META13:![0-9]+]], [[META14:![0-9]+]], [[META16:![0-9]+]], [[META17:![0-9]+]]}
120-
; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]]}
121-
; CHECK: [[META3]] = distinct !{[[META3]]}
122-
; CHECK: [[META4]] = distinct !{[[META4]], [[META3]]}
123-
; CHECK: [[META5]] = distinct !{[[META5]], [[META3]]}
124-
; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
125-
; CHECK: [[META7]] = distinct !{[[META7]]}
126-
; CHECK: [[META8]] = distinct !{[[META8]], [[META7]]}
127-
; CHECK: [[META9]] = distinct !{[[META9]], [[META7]]}
128-
; CHECK: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]]}
129-
; CHECK: [[META11]] = distinct !{[[META11]]}
130-
; CHECK: [[META12]] = distinct !{[[META12]], [[META11]]}
131-
; CHECK: [[META13]] = distinct !{[[META13]], [[META11]]}
132-
; CHECK: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]]}
133-
; CHECK: [[META15]] = distinct !{[[META15]]}
134-
; CHECK: [[META16]] = distinct !{[[META16]], [[META15]]}
135-
; CHECK: [[META17]] = distinct !{[[META17]], [[META15]]}
136-
; CHECK: [[META18]] = !{[[META19:![0-9]+]], [[META2]], [[META5]], [[META20:![0-9]+]], [[META6]], [[META9]], [[META21:![0-9]+]], [[META10]], [[META13]], [[META22:![0-9]+]], [[META14]], [[META17]]}
137-
; CHECK: [[META19]] = distinct !{[[META19]], [[META3]]}
138-
; CHECK: [[META20]] = distinct !{[[META20]], [[META7]]}
139-
; CHECK: [[META21]] = distinct !{[[META21]], [[META11]]}
140-
; CHECK: [[META22]] = distinct !{[[META22]], [[META15]]}
141-
; CHECK: [[META23]] = !{[[META19]], [[META4]], [[META5]], [[META20]], [[META8]], [[META9]], [[META21]], [[META12]], [[META13]], [[META22]], [[META16]], [[META17]]}
142-
; CHECK: [[META24]] = !{[[META10]], [[META12]], [[META13]], [[META14]], [[META16]], [[META17]]}
143-
; CHECK: [[META25]] = !{[[META19]], [[META2]], [[META4]], [[META20]], [[META6]], [[META8]], [[META21]], [[META10]], [[META12]], [[META22]], [[META14]], [[META16]]}
144-
; CHECK: [[META26]] = !{[[META22]]}
145-
; CHECK: [[META27]] = !{[[META14]], [[META16]], [[META17]]}
137+
; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="12" }
138+
; CHECK: attributes #[[ATTR1]] = { "amdgpu-lds-size"="16" }
139+
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
140+
; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
146141
;.

0 commit comments

Comments
 (0)