Skip to content

Commit 4112fbf

Browse files
author
Victor Lomuller
committed
Add fmoreimprovments
1 parent 9409de9 commit 4112fbf

File tree

10 files changed

+86
-75
lines changed

10 files changed

+86
-75
lines changed

llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -131,13 +131,6 @@ lowerDynamicLocalMemCallDirect(CallInst *CI, Triple TT,
131131
IRBuilder<> Builder(CI);
132132
if (TT.isSPIROrSPIRV())
133133
return Builder.CreateLoad(CI->getType(), LocalMemPlaceholder);
134-
Value *ArgAlign = CI->getArgOperand(0);
135-
Align RequestedAlignment{
136-
cast<llvm::ConstantInt>(ArgAlign)->getZExtValue()};
137-
MaybeAlign CurrentAlignment = LocalMemPlaceholder->getAlign();
138-
if (!CurrentAlignment.has_value() ||
139-
(CurrentAlignment.value() < RequestedAlignment))
140-
LocalMemPlaceholder->setAlignment(RequestedAlignment);
141134

142135
return Builder.CreatePointerCast(LocalMemPlaceholder, CI->getType());
143136
}();
@@ -211,6 +204,8 @@ static bool dynamicWGLocalMemory(Module &M) {
211204
GlobalVariable::NotThreadLocal, // ThreadLocalMode
212205
LocalAS // AddressSpace
213206
);
207+
if (!TT.isSPIROrSPIRV())
208+
LocalMemArrayGV->setAlignment(Align{128});
214209
}
215210
lowerLocalMemCall(DLMFunc, [&](CallInst *CI) {
216211
lowerDynamicLocalMemCallDirect(CI, TT, LocalMemArrayGV);

llvm/test/SYCLLowerIR/work_group_static.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,18 @@
66
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
77
target triple = "spir64-unknown-unknown"
88

9-
; CHECK-DAG: @__sycl_dynamicLocalMemoryPlaceholder_GV = linkonce_odr addrspace(3) global ptr addrspace(3) undef
9+
; CHECK: @__sycl_dynamicLocalMemoryPlaceholder_GV = linkonce_odr addrspace(3) global ptr addrspace(3) undef
1010

1111
; Function Attrs: convergent norecurse
12-
; CHECK-DAG: @_ZTS7KernelA(ptr addrspace(1) %0, ptr addrspace(3) noalias "sycl-implicit-local-arg" %[[IMPLICT_ARG:[a-zA-Z0-9]+]]{{.*}} !kernel_arg_addr_space ![[ADDR_SPACE_MD:[0-9]+]]
12+
; CHECK: @_ZTS7KernelA(ptr addrspace(1) %0, ptr addrspace(3) noalias "sycl-implicit-local-arg" %[[IMPLICT_ARG:[a-zA-Z0-9]+]]{{.*}} !kernel_arg_addr_space ![[ADDR_SPACE_MD:[0-9]+]]
1313
define weak_odr dso_local spir_kernel void @_ZTS7KernelA(ptr addrspace(1) %0) local_unnamed_addr #0 !kernel_arg_addr_space !5 {
1414
entry:
15-
; CHECK-DAG: store ptr addrspace(3) %[[IMPLICT_ARG]], ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV
16-
; CHECK-DAG: %[[LD1:[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV
15+
; CHECK: store ptr addrspace(3) %[[IMPLICT_ARG]], ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV
16+
; CHECK: %[[LD1:[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV
1717
%1 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 128) #1
18-
; CHECK-DAG: getelementptr inbounds i8, ptr addrspace(3) %[[LD1]]
18+
; CHECK: getelementptr inbounds i8, ptr addrspace(3) %[[LD1]]
1919
%2 = getelementptr inbounds i8, ptr addrspace(3) %1, i64 4
20-
; CHECK-DAG: %[[LD2:[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV
20+
; CHECK: %[[LD2:[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV
2121
%3 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 4) #1
2222
ret void
2323
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
; RUN: opt -S -sycllowerwglocalmemory -bugpoint-enable-legacy-pm < %s | FileCheck %s
2+
; RUN: opt -S -passes=sycllowerwglocalmemory < %s | FileCheck %s
3+
4+
; CHECK-NOT: __sycl_dynamicLocalMemoryPlaceholder
5+
6+
target triple = "nvptx64-nvidia-cuda"
7+
8+
; CHECK: @__sycl_dynamicLocalMemoryPlaceholder_GV = external addrspace(3) global [0 x i8], align 128
9+
10+
; Function Attrs: convergent norecurse
11+
; CHECK: @_ZTS7KernelA(ptr addrspace(1) %0)
12+
define void @_ZTS7KernelA(ptr addrspace(1) %0) local_unnamed_addr #0 !kernel_arg_addr_space !5 {
13+
entry:
14+
; CHECK: getelementptr inbounds i8, ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV
15+
%1 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 128) #1
16+
%2 = getelementptr inbounds i8, ptr addrspace(3) %1, i64 4
17+
%3 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 4) #1
18+
ret void
19+
}
20+
21+
; Function Attrs: convergent
22+
declare dso_local spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64) local_unnamed_addr #1
23+
24+
attributes #0 = { convergent norecurse "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" "sycl-work-group-static"="1" }
25+
attributes #1 = { convergent norecurse }
26+
27+
!llvm.module.flags = !{!0}
28+
!opencl.spir.version = !{!1}
29+
!spirv.Source = !{!2}
30+
!llvm.ident = !{!3}
31+
32+
!0 = !{i32 1, !"wchar_size", i32 4}
33+
!1 = !{i32 1, i32 2}
34+
!2 = !{i32 4, i32 100000}
35+
!3 = !{!"clang version 13.0.0"}
36+
!4 = !{}
37+
; ![[ADDR_SPACE_MD]] = !{i32 1, i32 3}
38+
!5 = !{i32 1}

sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_static.asciidoc

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -195,10 +195,7 @@ to a dynamically allocated buffer in the device local memory.
195195

196196
[source,c++]
197197
----
198-
template <typename T>
199-
sycl::multi_ptr<T, access::address_space::local_space,
200-
access::decorated::no>
201-
get_dynamic_work_group_memory()
198+
void* get_dynamic_work_group_memory()
202199
----
203200
_Constraints_: `T` must be trivially constructible and trivially destructible.
204201

@@ -209,9 +206,7 @@ The size of the allocation is unknown at compile-time,
209206
and must be communicated to the SYCL implementation via the
210207
`work_group_static_memory_size` property. Every call to
211208
`get_dynamic_work_group_memory` returns the same allocation
212-
in device local memory, regardless of `T`. For example, two call declared
213-
as `get_dynamic_work_group_memory<int>` and
214-
`get_dynamic_work_group_memory<float>` will be associated with the same shared allocation.
209+
in device local memory.
215210

216211
=== Kernel properties
217212

@@ -290,8 +285,8 @@ using namespace syclex = sycl::ext::oneapi::experimental;
290285
q.parallel_for(sycl::nd_range<1>{N, M},
291286
syclex::properties{syclex::work_group_static_size(M * sizeof(int))},
292287
[=](sycl::nd_item<1> it) {
293-
auto ptr= get_dynamic_work_group_memory<int>();
294-
auto ptr2= get_dynamic_work_group_memory<float>();
288+
auto ptr= get_dynamic_work_group_memory();
289+
auto ptr2= get_dynamic_work_group_memory();
295290
});
296291
----
297292

@@ -302,13 +297,9 @@ This non-normative section provides information about one possible
302297
implementation of this extension. It is not part of the specification of the
303298
extension's API.
304299

305-
For class types and bounded arrays, the class can be implemented using
306-
the WG scope attribute used by hierarchical implementation.
307-
Note, however, that this requires to expose the attribute to the users.
308-
309300
For `get_dynamic_work_group_memory`,
310-
and the implementation may need to generate some additional code to
311-
appropriately initialize the pointer(s) returns by the call.
301+
the implementation may need to generate some additional code to
302+
appropriately initialize the pointer(s) returned by the call.
312303
Alternatively, it may be possible to initialize the pointer to the beginning
313304
of the device's local memory region (if that value is known). Either way, the
314305
implementation must account for the existence of one or more `local_accessor`

sycl/include/sycl/access/access.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ __sycl_allocateLocalMemory(std::size_t Size, std::size_t Alignment);
2323
// Request a placeholder for a dynamically-sized buffer in local address space
2424
// at kernel scope. Required for work_group_static.
2525
extern "C" __DPCPP_SYCL_EXTERNAL __attribute__((opencl_local)) std::uint8_t *
26-
__sycl_dynamicLocalMemoryPlaceholder(std::size_t Alignment);
26+
__sycl_dynamicLocalMemoryPlaceholder();
2727
#endif
2828
namespace sycl {
2929
inline namespace _V1 {

sycl/include/sycl/ext/oneapi/work_group_static.hpp

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ template <typename T> class __SYCL_WG_SCOPE work_group_static final {
4646
template <class TArg = T>
4747
typename std::enable_if<!std::is_array_v<TArg>, work_group_static &>::type
4848
operator=(const T &value) noexcept {
49-
*getDecorated() = value;
49+
data = value;
5050
return *this;
5151
}
5252

@@ -56,16 +56,10 @@ template <typename T> class __SYCL_WG_SCOPE work_group_static final {
5656
T data;
5757
};
5858

59-
template <typename T>
60-
std::enable_if_t<
61-
std::is_trivially_destructible_v<T> && std::is_trivially_constructible_v<T>,
62-
multi_ptr<T, access::address_space::local_space, access::decorated::no>>
63-
__SYCL_ALWAYS_INLINE get_dynamic_work_group_memory() {
59+
__SYCL_ALWAYS_INLINE
60+
inline void* get_dynamic_work_group_memory() {
6461
#ifdef __SYCL_DEVICE_ONLY__
65-
return multi_ptr<T, access::address_space::local_space,
66-
access::decorated::no>{
67-
reinterpret_cast<__attribute__((opencl_local)) T *>(
68-
__sycl_dynamicLocalMemoryPlaceholder(alignof(T)))};
62+
return __sycl_dynamicLocalMemoryPlaceholder();
6963
#else
7064
throw sycl::exception(
7165
sycl::errc::feature_not_supported,

sycl/test-e2e/WorkGroupStaticMemory/copy_dynamic_size.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ void copy_via_smem(DataType *a, DataType *b, sycl::nd_item<1> it) {
1717
// And then puts in back into B
1818

1919
DataType *smem_ptr =
20-
sycl_ext::get_dynamic_work_group_memory<DataType>().get();
20+
reinterpret_cast<DataType *>(sycl_ext::get_dynamic_work_group_memory());
2121
auto threadIdx_x = it.get_local_linear_id();
2222

2323
smem_ptr[threadIdx_x] = a[threadIdx_x];

sycl/test-e2e/WorkGroupStaticMemory/dynamic_alloc_local_accessor.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,8 @@ int main() {
3434
sycl::local_accessor<int>(WgSize * RepeatWG * sizeof(int), Cgh);
3535
Cgh.parallel_for(nd_range<1>(range<1>(Size), range<1>(WgSize)), properties,
3636
[=](nd_item<1> Item) {
37-
multi_ptr<int, access::address_space::local_space,
38-
sycl::access::decorated::no>
39-
Ptr = sycl::ext::oneapi::experimental::
40-
get_dynamic_work_group_memory<int>();
37+
int *Ptr = reinterpret_cast<int *>(
38+
sycl_ext::get_dynamic_work_group_memory());
4139
size_t GroupOffset =
4240
Item.get_group_linear_id() * ElemPerWG;
4341
for (size_t I = 0; I < RepeatWG; ++I) {

sycl/test-e2e/WorkGroupStaticMemory/dynamic_alloc_ptr_alias.cpp

Lines changed: 24 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -30,35 +30,32 @@ int main() {
3030
sycl_ext::work_group_static_size static_size(WgSize * RepeatWG *
3131
sizeof(int));
3232
sycl_ext::properties properties{static_size};
33-
Cgh.parallel_for(
34-
nd_range<1>(range<1>(Size), range<1>(WgSize)), properties,
35-
[=](nd_item<1> Item) {
36-
multi_ptr<int, access::address_space::local_space,
37-
sycl::access::decorated::no>
38-
Ptr = sycl::ext::oneapi::experimental::
39-
get_dynamic_work_group_memory<int>();
40-
size_t GroupOffset = Item.get_group_linear_id() * ElemPerWG;
41-
for (size_t I = 0; I < RepeatWG; ++I) {
42-
Ptr[WgSize * I + Item.get_local_linear_id()] =
43-
Item.get_local_linear_id();
44-
}
33+
Cgh.parallel_for(nd_range<1>(range<1>(Size), range<1>(WgSize)), properties,
34+
[=](nd_item<1> Item) {
35+
int *Ptr = reinterpret_cast<int *>(
36+
sycl_ext::get_dynamic_work_group_memory());
37+
size_t GroupOffset =
38+
Item.get_group_linear_id() * ElemPerWG;
39+
for (size_t I = 0; I < RepeatWG; ++I) {
40+
Ptr[WgSize * I + Item.get_local_linear_id()] =
41+
Item.get_local_linear_id();
42+
}
4543

46-
Item.barrier();
47-
// Check that multiple calls return the same pointer.
48-
multi_ptr<unsigned int, access::address_space::local_space,
49-
sycl::access::decorated::no>
50-
PtrAlias = sycl::ext::oneapi::experimental::
51-
get_dynamic_work_group_memory<unsigned int>();
44+
Item.barrier();
45+
// Check that multiple calls return the same pointer.
46+
unsigned int *PtrAlias =
47+
reinterpret_cast<unsigned int *>(
48+
sycl_ext::get_dynamic_work_group_memory());
5249

53-
for (size_t I = 0; I < RepeatWG; ++I) {
54-
// Check that the memory is accessible from other
55-
// work-items
56-
size_t BaseIdx = GroupOffset + (I * WgSize);
57-
size_t LocalIdx = Item.get_local_linear_id() ^ 1;
58-
size_t GlobalIdx = BaseIdx + LocalIdx;
59-
Acc[GlobalIdx] = PtrAlias[WgSize * I + LocalIdx];
60-
}
61-
});
50+
for (size_t I = 0; I < RepeatWG; ++I) {
51+
// Check that the memory is accessible from other
52+
// work-items
53+
size_t BaseIdx = GroupOffset + (I * WgSize);
54+
size_t LocalIdx = Item.get_local_linear_id() ^ 1;
55+
size_t GlobalIdx = BaseIdx + LocalIdx;
56+
Acc[GlobalIdx] = PtrAlias[WgSize * I + LocalIdx];
57+
}
58+
});
6259
});
6360

6461
host_accessor Acc(Buf, read_only);

sycl/test-e2e/WorkGroupStaticMemory/dynamic_allocation.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,8 @@ int main() {
3131
sycl_ext::properties properties{static_size};
3232
Cgh.parallel_for(nd_range<1>(range<1>(Size), range<1>(WgSize)), properties,
3333
[=](nd_item<1> Item) {
34-
multi_ptr<int, access::address_space::local_space,
35-
sycl::access::decorated::no>
36-
Ptr = sycl::ext::oneapi::experimental::
37-
get_dynamic_work_group_memory<int>();
34+
int *Ptr = reinterpret_cast<int *>(
35+
sycl_ext::get_dynamic_work_group_memory());
3836
size_t GroupOffset =
3937
Item.get_group_linear_id() * ElemPerWG;
4038
for (size_t I = 0; I < RepeatWG; ++I) {

0 commit comments

Comments
 (0)