Skip to content

Commit d764d00

Browse files
committed
inline syclcompat::local_mem as well
1 parent a4fe915 commit d764d00

File tree

6 files changed

+62
-24
lines changed

6 files changed

+62
-24
lines changed

llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
//===----------------------------------------------------------------------===//
1010

1111
#include "llvm/SYCLLowerIR/LowerWGLocalMemory.h"
12+
#include "llvm/ADT/DenseSet.h"
1213
#include "llvm/Demangle/Demangle.h"
1314
#include "llvm/IR/Function.h"
1415
#include "llvm/IR/IRBuilder.h"
@@ -91,29 +92,35 @@ ModulePass *llvm::createSYCLLowerWGLocalMemoryLegacyPass() {
9192
// inlined first before each __sycl_allocateLocalMemory call can be lowered to a
9293
// unique global variable. Inlining them here so that this pass doesn't have
9394
// implicit dependency on AlwaysInlinerPass.
95+
//
96+
// syclcompat::local_mem, which represents a unique allocation, calls
97+
// group_local_memory_for_overwrite. So local_mem should be inlined as well.
9498
static bool inlineGroupLocalMemoryFunc(Module &M) {
9599
Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL);
96100
if (!ALMFunc || ALMFunc->use_empty())
97101
return false;
98102

99-
bool Changed = false;
100-
for (auto *U : ALMFunc->users()) {
101-
auto *Caller = cast<CallInst>(U)->getFunction();
102-
if (!Caller->hasFnAttribute("sycl_forceinline")) {
103-
// Already inlined.
104-
continue;
105-
}
106-
for (auto *U2 : make_early_inc_range(Caller->users())) {
107-
auto *CI = cast<CallInst>(U2);
108-
InlineFunctionInfo IFI;
109-
[[maybe_unused]] auto Result = InlineFunction(*CI, IFI);
110-
assert(Result.isSuccess() && "inlining failed");
103+
SmallVector<Function *, 4> WorkList{ALMFunc};
104+
DenseSet<Function *> Visited;
105+
while (!WorkList.empty()) {
106+
auto *F = WorkList.pop_back_val();
107+
for (auto *U : make_early_inc_range(F->users())) {
108+
auto *CI = cast<CallInst>(U);
109+
auto *Caller = CI->getFunction();
110+
if (Caller->hasFnAttribute("sycl-forceinline") &&
111+
Visited.insert(Caller).second)
112+
WorkList.push_back(Caller);
113+
if (F != ALMFunc) {
114+
InlineFunctionInfo IFI;
115+
[[maybe_unused]] auto Result = InlineFunction(*CI, IFI);
116+
assert(Result.isSuccess() && "inlining failed");
117+
}
111118
}
112-
Caller->eraseFromParent();
113-
Changed = true;
114119
}
120+
for (auto *F : Visited)
121+
F->eraseFromParent();
115122

116-
return Changed;
123+
return !Visited.empty();
117124
}
118125

119126
// TODO: It should be checked that __sycl_allocateLocalMemory (or its source

llvm/test/SYCLLowerIR/group_local_memory_inline.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,4 @@ entry:
6363
declare spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 noundef, i64 noundef)
6464

6565
attributes #0 = { alwaysinline }
66-
attributes #1 = { "sycl_forceinline"="true" }
66+
attributes #1 = { "sycl-forceinline"="true" }

sycl/include/sycl/ext/oneapi/group_local_memory.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ inline namespace _V1 {
2222
namespace ext::oneapi {
2323
template <typename T, typename Group>
2424
#ifdef __SYCL_DEVICE_ONLY__
25-
[[__sycl_detail__::add_ir_attributes_function("sycl_forceinline", true)]]
25+
[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]]
2626
#endif
2727
std::enable_if_t<
2828
std::is_trivially_destructible_v<T> && sycl::detail::is_group<Group>::value,
@@ -48,7 +48,7 @@ group_local_memory_for_overwrite(Group g) {
4848

4949
template <typename T, typename Group, typename... Args>
5050
#ifdef __SYCL_DEVICE_ONLY__
51-
[[__sycl_detail__::add_ir_attributes_function("sycl_forceinline", true)]]
51+
[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]]
5252
#endif
5353
std::enable_if_t<
5454
std::is_trivially_destructible_v<T> && sycl::detail::is_group<Group>::value,

sycl/include/syclcompat/memory.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,11 @@
6868

6969
namespace syclcompat {
7070

71-
template <typename AllocT> auto *local_mem() {
71+
template <typename AllocT>
72+
#ifdef __SYCL_DEVICE_ONLY__
73+
[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]]
74+
#endif
75+
auto *local_mem() {
7276
sycl::multi_ptr<AllocT, sycl::access::address_space::local_space>
7377
As_multi_ptr =
7478
sycl::ext::oneapi::group_local_memory_for_overwrite<AllocT>(

sycl/test/check_device_code/extensions/group_local_memory.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@
55
// The test checks that multiple calls to the same template instantiation of a
66
// group local memory function result in separate allocations.
77

8+
// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
9+
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
10+
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
11+
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
12+
813
#include <sycl/detail/core.hpp>
914
#include <sycl/ext/oneapi/group_local_memory.hpp>
1015
#include <sycl/usm.hpp>
@@ -31,8 +36,3 @@ int main() {
3136
});
3237
});
3338
}
34-
35-
// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
36-
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
37-
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
38-
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s
2+
3+
// The test checks that multiple calls to the same template instantiation of
4+
// syclcompat local_mem function result in separate allocations.
5+
6+
// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
7+
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
8+
9+
#include <sycl/detail/core.hpp>
10+
#include <syclcompat/memory.hpp>
11+
12+
using namespace sycl;
13+
14+
int main() {
15+
queue Q;
16+
17+
int **Out = malloc_shared<int *>(4, Q);
18+
19+
Q.submit([&](handler &Cgh) {
20+
Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) {
21+
auto Ptr0 = syclcompat::local_mem<int[1]>();
22+
auto Ptr1 = syclcompat::local_mem<int[1]>();
23+
Out[0] = Ptr0;
24+
Out[1] = Ptr1;
25+
});
26+
});
27+
}

0 commit comments

Comments
 (0)