Skip to content

Commit 30e1d2f

Browse files
authored
[SYCLLowerIR][GlobalOffset] Fix collection order of load's defs (#20428)
This PR fixes two issues in the previous logic: (1) a phi user of an `@llvm.amdgcn.implicit.offset` call was not handled; (2) the post-order collection of a load's defs was incorrect, producing the wrong order in `PtrUses`.
1 parent 49cd4ba commit 30e1d2f

File tree

3 files changed

+184
-24
lines changed

3 files changed

+184
-24
lines changed

llvm/lib/SYCLLowerIR/GlobalOffset.cpp

Lines changed: 54 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -60,16 +60,52 @@ ModulePass *llvm::createGlobalOffsetPassLegacy() {
6060
return new GlobalOffsetLegacy();
6161
}
6262

63-
// Recursive helper function to collect Loads from GEPs in a BFS fashion.
64-
static void getLoads(Instruction *P, SmallVectorImpl<Instruction *> &Traversed,
65-
SmallVectorImpl<LoadInst *> &Loads) {
66-
Traversed.push_back(P);
67-
if (auto *L = dyn_cast<LoadInst>(P)) // Base case for recursion
68-
Loads.push_back(L);
69-
else {
70-
assert(isa<GetElementPtrInst>(*P));
71-
for (Value *V : P->users())
72-
getLoads(cast<Instruction>(V), Traversed, Loads);
63+
// Helper function to collect all GEPs, PHIs and Loads in post-order.
64+
static void collectGlobalOffsetUses(Function *ImplicitOffsetIntrinsic,
65+
SmallVectorImpl<Instruction *> &LoadPtrUses,
66+
SmallVectorImpl<Instruction *> &Loads) {
67+
SmallVector<Instruction *, 4> WorkList;
68+
SmallPtrSet<Value *, 4> Visited;
69+
70+
// Find load instructions.
71+
for (auto *U : ImplicitOffsetIntrinsic->users()) {
72+
for (auto *U2 : cast<CallInst>(U)->users()) {
73+
auto *I = cast<Instruction>(U2);
74+
WorkList.push_back(I);
75+
Visited.insert(I);
76+
}
77+
}
78+
while (!WorkList.empty()) {
79+
Instruction *I = WorkList.pop_back_val();
80+
if (isa<PHINode>(I) || isa<GetElementPtrInst>(I)) {
81+
for (User *U : I->users())
82+
if (Visited.insert(U).second)
83+
WorkList.push_back(cast<Instruction>(U));
84+
}
85+
if (isa<LoadInst>(I))
86+
Loads.push_back(I);
87+
}
88+
89+
// For each load, find its defs by post-order walking operand use.
90+
Visited.clear();
91+
for (auto *LI : Loads) {
92+
Use *OpUse0 = &LI->getOperandUse(0);
93+
auto PostOrderTraveral = [&](auto &Self, Use &U) -> void {
94+
auto *I = cast<Instruction>(U.get());
95+
Visited.insert(I);
96+
for (auto &Op : I->operands()) {
97+
auto *OpI = dyn_cast<Instruction>(Op.get());
98+
if (!OpI || isa<CallInst>(OpI))
99+
continue;
100+
if (!Visited.contains(OpI))
101+
Self(Self, Op);
102+
}
103+
if (!isa<CallInst>(I))
104+
LoadPtrUses.push_back(I);
105+
};
106+
Visited.insert(LI);
107+
if (!Visited.contains(OpUse0->get()))
108+
PostOrderTraveral(PostOrderTraveral, *OpUse0);
73109
}
74110
}
75111

@@ -199,32 +235,26 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
199235
// Add implicit parameters to all direct and indirect users of the offset
200236
addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr, KCache);
201237
}
202-
SmallVector<CallInst *, 4> Worklist;
203-
SmallVector<LoadInst *, 4> Loads;
238+
SmallVector<Instruction *, 4> Loads;
204239
SmallVector<Instruction *, 4> PtrUses;
205240

206-
// Collect all GEPs and Loads from the intrinsic's CallInsts
207-
for (Value *V : ImplicitOffsetIntrinsic->users()) {
208-
Worklist.push_back(cast<CallInst>(V));
209-
for (Value *V2 : V->users())
210-
getLoads(cast<Instruction>(V2), PtrUses, Loads);
211-
}
241+
collectGlobalOffsetUses(ImplicitOffsetIntrinsic, PtrUses, Loads);
212242

213243
// Replace each use of a collected Load with a Constant 0
214-
for (LoadInst *L : Loads)
244+
for (Instruction *L : Loads) {
215245
L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0));
246+
L->eraseFromParent();
247+
}
216248

217249
// Remove all collected Loads and GEPs from the kernel.
218-
// PtrUses is returned by `getLoads` in topological order.
250+
// PtrUses is returned by `collectGlobalOffsetUses` in topological order.
219251
// Walk it backwards so we don't violate users.
220252
for (auto *I : reverse(PtrUses))
221253
I->eraseFromParent();
222254

223255
// Remove all collected CallInsts from the kernel.
224-
for (CallInst *CI : Worklist) {
225-
auto *I = cast<Instruction>(CI);
226-
I->eraseFromParent();
227-
}
256+
for (auto *U : make_early_inc_range(ImplicitOffsetIntrinsic->users()))
257+
cast<Instruction>(U)->eraseFromParent();
228258

229259
// Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
230260
// it.
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s
2+
3+
; Check that phi nodes are handled correctly when collecting a load's pointer defs.
4+
5+
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
6+
target triple = "amdgcn-amd-amdhsa"
7+
8+
define i64 @test_phi(i32 %x) {
9+
; CHECK-LABEL: define i64 @test_phi(
10+
; CHECK-SAME: i32 [[X:%.*]]) {
11+
; CHECK-NEXT: [[ENTRY:.*:]]
12+
; CHECK-NEXT: switch i32 [[X]], label %[[B5:.*]] [
13+
; CHECK-NEXT: i32 0, label %[[B1:.*]]
14+
; CHECK-NEXT: i32 1, label %[[B2:.*]]
15+
; CHECK-NEXT: i32 2, label %[[B3:.*]]
16+
; CHECK-NEXT: ]
17+
; CHECK: [[B1]]:
18+
; CHECK-NEXT: br label %[[B4:.*]]
19+
; CHECK: [[B2]]:
20+
; CHECK-NEXT: br label %[[B4]]
21+
; CHECK: [[B3]]:
22+
; CHECK-NEXT: br label %[[B4]]
23+
; CHECK: [[B4]]:
24+
; CHECK-NEXT: [[EXT1:%.*]] = zext i32 0 to i64
25+
; CHECK-NEXT: [[EXT2:%.*]] = zext i32 0 to i64
26+
; CHECK-NEXT: [[RES:%.*]] = add nuw nsw i64 [[EXT1]], [[EXT2]]
27+
; CHECK-NEXT: ret i64 [[RES]]
28+
; CHECK: [[B5]]:
29+
; CHECK-NEXT: unreachable
30+
;
31+
entry:
32+
switch i32 %x, label %b5 [
33+
i32 0, label %b1
34+
i32 1, label %b2
35+
i32 2, label %b3
36+
]
37+
38+
b1: ; preds = %entry
39+
%offset0 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset()
40+
br label %b4
41+
42+
b2: ; preds = %entry
43+
%offset1 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset()
44+
%gep1 = getelementptr inbounds nuw i8, ptr addrspace(5) %offset1, i32 4
45+
br label %b4
46+
47+
b3: ; preds = %entry
48+
%offset2 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset()
49+
%gep2 = getelementptr inbounds nuw i8, ptr addrspace(5) %offset2, i32 8
50+
br label %b4
51+
52+
b4: ; preds = %b3, %b2, %b1
53+
%p = phi ptr addrspace(5) [ %offset0, %b1 ], [ %gep1, %b2 ], [ %gep2, %b3 ]
54+
%load1 = load i32, ptr addrspace(5) %p, align 4
55+
%load2 = load i32, ptr addrspace(5) %p, align 4
56+
%ext1 = zext i32 %load1 to i64
57+
%ext2 = zext i32 %load2 to i64
58+
%res = add nuw nsw i64 %ext1, %ext2
59+
ret i64 %res
60+
61+
b5: ; preds = %entry
62+
unreachable
63+
}
64+
65+
; CHECK-LABEL: define i64 @test_phi_with_offset(
66+
; CHECK-SAME: i32 [[X:%.*]], ptr addrspace(5) [[PTR:%.*]]) {
67+
; CHECK-NEXT: [[ENTRY:.*:]]
68+
; CHECK-NEXT: switch i32 [[X]], label %[[B5:.*]] [
69+
; CHECK-NEXT: i32 0, label %[[B1:.*]]
70+
; CHECK-NEXT: i32 1, label %[[B2:.*]]
71+
; CHECK-NEXT: i32 2, label %[[B3:.*]]
72+
; CHECK-NEXT: ]
73+
; CHECK: [[B1]]:
74+
; CHECK-NEXT: br label %[[B4:.*]]
75+
; CHECK: [[B2]]:
76+
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[PTR]], i32 4
77+
; CHECK-NEXT: br label %[[B4]]
78+
; CHECK: [[B3]]:
79+
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[PTR]], i32 8
80+
; CHECK-NEXT: br label %[[B4]]
81+
; CHECK: [[B4]]:
82+
; CHECK-NEXT: [[P:%.*]] = phi ptr addrspace(5) [ [[PTR]], %[[B1]] ], [ [[GEP1]], %[[B2]] ], [ [[GEP2]], %[[B3]] ]
83+
; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(5) [[P]], align 4
84+
; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(5) [[P]], align 4
85+
; CHECK-NEXT: [[EXT1:%.*]] = zext i32 [[LOAD1]] to i64
86+
; CHECK-NEXT: [[EXT2:%.*]] = zext i32 [[LOAD2]] to i64
87+
; CHECK-NEXT: [[RES:%.*]] = add nuw nsw i64 [[EXT1]], [[EXT2]]
88+
; CHECK-NEXT: ret i64 [[RES]]
89+
; CHECK: [[B5]]:
90+
; CHECK-NEXT: unreachable
91+
92+
declare ptr addrspace(5) @llvm.amdgcn.implicit.offset()
93+
94+
!llvm.module.flags = !{!0}
95+
96+
!0 = !{i32 1, !"sycl-device", i32 1}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s
2+
3+
target datalayout = "e-p6:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
4+
target triple = "nvptx64-nvidia-cuda"
5+
6+
declare ptr @llvm.nvvm.implicit.offset()
7+
8+
define i32 @test_two_loads() {
9+
; CHECK-LABEL: define i32 @test_two_loads() {
10+
; CHECK-NEXT: [[ENTRY:.*:]]
11+
; CHECK-NEXT: [[RES:%.*]] = add i32 0, 0
12+
; CHECK-NEXT: ret i32 [[RES]]
13+
;
14+
entry:
15+
%offset = tail call ptr @llvm.nvvm.implicit.offset()
16+
%gep = getelementptr inbounds nuw i8, ptr %offset, i64 4
17+
%load1 = load i32, ptr %gep, align 4
18+
%load2 = load i32, ptr %offset, align 4
19+
%res = add i32 %load1, %load2
20+
ret i32 %res
21+
}
22+
23+
; CHECK-LABEL: define i32 @test_two_loads_with_offset(
24+
; CHECK-SAME: ptr [[PTR:%.*]]) {
25+
; CHECK-NEXT: [[ENTRY:.*:]]
26+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 4
27+
; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP]], align 4
28+
; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[PTR]], align 4
29+
; CHECK-NEXT: [[RES:%.*]] = add i32 [[LOAD1]], [[LOAD2]]
30+
; CHECK-NEXT: ret i32 [[RES]]
31+
32+
!llvm.module.flags = !{!0}
33+
34+
!0 = !{i32 1, !"sycl-device", i32 1}

0 commit comments

Comments
 (0)