-
Notifications
You must be signed in to change notification settings - Fork 796
[SYCL] Implement work group memory extension #15178
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 43 commits
652caa8
76daf77
21e082b
025cbc4
b94f7c9
0d6d694
448071f
9f2973a
852315f
4234022
ae5eb7e
8ce0280
4ee31a5
7b1b90b
cf7476e
50c0954
44811b8
ad1046f
d343a2e
3f1bc30
103e233
bfa5830
2031478
76f0acc
e0ad435
3513251
2cec997
4c8b196
a0b70e2
ed8f125
9460876
ac7130a
e2889b3
d6c78b9
8f7a07b
ae59899
8cff603
3ceead1
3228aeb
0e95ee5
52f13f0
71d1013
d48bc42
f6515bc
3e4c73c
c84229e
d2fddd8
0f677c2
6ef823e
6dc262a
4de6d50
5653f04
40eb63e
f6a0df7
026501c
2ce21b3
a9b2875
3821df4
d73b0b1
c1087ad
31481b8
396169f
dc37b2c
236139f
2beda8e
dbafe31
1b968df
e6b66c3
84ef6a8
f24af09
7dfa80b
3957cb5
91820d8
34bc23d
3acf835
604c640
5510208
d9418f9
e90a3b7
3b9a55a
b9ed6f4
af08c19
b2a97a2
77a6de1
3cb0ba4
1783f75
5a6085f
6affbc3
ed3c60f
fd89473
24f87b0
cba30a3
3b10242
f38f400
38a8d79
4df2f48
9a7e3f1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,88 @@ | ||
| //===-------------------- work_group_memory.hpp ---------------------------===// | ||
| // | ||
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| #pragma once | ||
|
|
||
| #include <type_traits> | ||
|
|
||
| namespace sycl { | ||
| inline namespace _V1 { | ||
| namespace detail { | ||
| template <typename T> struct is_unbounded_array : std::false_type {}; | ||
|
|
||
| template <typename T> struct is_unbounded_array<T[]> : std::true_type {}; | ||
|
|
||
| template <typename T> | ||
| inline constexpr bool is_unbounded_array_v = is_unbounded_array<T>::value; | ||
|
|
||
| class work_group_memory_impl { | ||
| public: | ||
| work_group_memory_impl() : wgm_size{0}, buffer_size{0} {} | ||
| work_group_memory_impl(const work_group_memory_impl &rhs) = default; | ||
| work_group_memory_impl & | ||
| operator=(const work_group_memory_impl &rhs) = default; | ||
| work_group_memory_impl(size_t wgm_size, size_t buffer_size) | ||
| : wgm_size{wgm_size}, buffer_size{buffer_size} {} | ||
| size_t wgm_size; | ||
| size_t buffer_size; | ||
| }; | ||
|
|
||
| inline size_t getWorkGroupMemoryOwnSize(detail::work_group_memory_impl *wgm) { | ||
| return wgm->wgm_size; | ||
| } | ||
| } // namespace detail | ||
|
|
||
| namespace ext::oneapi::experimental { | ||
| template <typename DataT, typename PropertyListT = empty_properties_t> | ||
| class __SYCL_SPECIAL_CLASS __SYCL_TYPE(work_group_memory) work_group_memory | ||
| : sycl::detail::work_group_memory_impl { | ||
| public: | ||
| using value_type = std::remove_all_extents_t<DataT>; | ||
|
|
||
| private: | ||
| using decoratedPtr = typename sycl::detail::DecoratedType< | ||
| value_type, access::address_space::local_space>::type *; | ||
|
|
||
| public: | ||
| work_group_memory() = default; | ||
| work_group_memory(const work_group_memory &rhs) = default; | ||
| work_group_memory &operator=(const work_group_memory &rhs) = default; | ||
| template <typename T = DataT, | ||
| typename = std::enable_if_t<!sycl::detail::is_unbounded_array_v<T>>> | ||
| work_group_memory(handler &) | ||
| : sycl::detail::work_group_memory_impl(sizeof(work_group_memory), | ||
| sizeof(DataT)) {} | ||
| template <typename T = DataT, | ||
| typename = std::enable_if_t<sycl::detail::is_unbounded_array_v<T>>> | ||
| work_group_memory(size_t num, handler &) | ||
| : sycl::detail::work_group_memory_impl( | ||
| sizeof(work_group_memory), | ||
| num * sizeof(std::remove_extent_t<DataT>)) {} | ||
| template <access::decorated IsDecorated = access::decorated::no> | ||
| multi_ptr<value_type, access::address_space::local_space, IsDecorated> | ||
| get_multi_ptr() const { | ||
| return sycl::address_space_cast<access::address_space::local_space, | ||
| IsDecorated, value_type>(ptr); | ||
| } | ||
| DataT *operator&() const { return reinterpret_cast<DataT *>(ptr); } | ||
| operator DataT &() const { return *reinterpret_cast<DataT *>(ptr); } | ||
| template <typename T = DataT, | ||
| typename = std::enable_if_t<!std::is_array_v<T>>> | ||
| const work_group_memory &operator=(const DataT &value) const { | ||
| *ptr = value; | ||
| return *this; | ||
| } | ||
| #ifdef __SYCL_DEVICE_ONLY__ | ||
| void __init(decoratedPtr ptr) { this->ptr = ptr; } | ||
| #endif | ||
| private: | ||
| decoratedPtr ptr; | ||
| }; | ||
| } // namespace ext::oneapi::experimental | ||
| } // namespace _V1 | ||
| } // namespace sycl | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| // RUN: %{build} -o %t.out | ||
| // RUN: env SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s | ||
lbushi25 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| #include <sycl/detail/core.hpp> | ||
| #include <sycl/ext/oneapi/experimental/work_group_memory.hpp> | ||
|
|
||
| // Check that the work group memory object is mapped to exactly one backend | ||
| // kernel argument. | ||
|
|
||
| int main() { | ||
| sycl::queue q; | ||
| q.submit([&](sycl::handler &cgh) { | ||
| sycl::ext::oneapi::experimental::work_group_memory<int[2]> data{cgh}; | ||
| cgh.parallel_for(sycl::nd_range<1>{1, 1}, | ||
| [=](sycl::nd_item<1> it) { data[0] = 42; }); | ||
| }); | ||
| } | ||
|
|
||
| // CHECK-COUNT-1: ---> urKernelSetArg | ||
| // CHECK-NOT: ---> urKernelSetArg | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| // RUN: %{build} -o %t.out | ||
| // RUN: %{run} %t.out | ||
| #include <cassert> | ||
| #include <sycl/sycl.hpp> | ||
| using namespace sycl::ext::oneapi::experimental; | ||
|
|
||
| // As per the spec, a work_group_memory object cannot be used in a single task | ||
| // kernel or in a sycl::range kernel. An exception with error code | ||
| // errc::kernel_argument must be thrown in such cases. This test verifies this. | ||
|
|
||
| int main() { | ||
|
|
||
| sycl::queue q; | ||
| try { | ||
| q.submit([&](sycl::handler &cgh) { | ||
| work_group_memory<int> mem{cgh}; | ||
| cgh.single_task([=]() { mem = 42; }); | ||
| }); | ||
| assert(false && "Work group memory was used in a single_task kernel and an " | ||
| "exception was not seen"); // Fail, exception was not seen | ||
| } catch (sycl::exception &e) { | ||
| // Exception seen but must verify that the error code is correct | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we also check for the error message? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That would be an implementation detail and it could change at any time breaking the test so I'm leaning against it. |
||
| assert(e.code() == sycl::errc::kernel_argument); | ||
| } | ||
| // Same thing but with a range kernel | ||
| try { | ||
| q.submit([&](sycl::handler &cgh) { | ||
| work_group_memory<int> mem{cgh}; | ||
| cgh.parallel_for(sycl::range{1}, [=](sycl::id<> it) { mem = 42; }); | ||
| }); | ||
| assert(false && "Work group memory was used in a range kernel and an " | ||
| "exception was not seen"); // Fail, exception was not seen | ||
| } catch (sycl::exception &e) { | ||
| // Exception seen but must verify that the error code is correct | ||
| assert(e.code() == sycl::errc::kernel_argument); | ||
| } | ||
| return 0; | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I haven't looked at this closely, but this looks suspicious. The
work_group_memoryobject is required to correspond to just a single Level Zero kernel parameter, which is a pointer to work-group-local memory. This is the requirement specified here:https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc#dpc-guaranteed-compatibility-with-level-zero-and-opencl-backends
It looks like
work_group_memory_implhas two member variables of typesize_t. In addition, thework_group_memorytype has a member variable of pointer type. Does this mean that the Level Zero kernel will end up with three kernel parameters?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Initial testing shows that the L0 kernel only ends up with one parameter, but I will make this into a proper test.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder why there is only one Level Zero parameter. Is it because
wgm_sizeandbuffer_sizeare optimized away in the device code? I'd be concerned about this because the requirement of one Level Zero parameter holds even when optimization is disabled.When I look at the code, I don't see any uses of
buffer_size. Is that still needed?If all we need is
wgm_size, I wonder if it would be safer to implementwork_group_memorysuch that its only data member was a union:Or, you could use
std::variantinstead of a union in a similar way.Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So, my understanding is that the
processArgfunction inhandler.cppfile defines a mapping between the SYCL kernel parameters and the L0 kernel arguments. In the case ofwork_group_memory, I've defined the mapping to be such that whenever the runtime sees a work group memory parameter passed to a SYCL kernel, instead map that to a local memory buffer on the underlying backend where the size of the buffer is given by thebuffer_sizemember of the work group memory object. If you look at my changes inprocessArgfunction, thats exactly what I'm doing. Therefore, with or without optimization, I believe there will only be one L0 kernel argument per work group memory object.Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The relevant lines are 793-798 in
handler.cppfile.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, thanks for the explanation!
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@lbushi25 is right that this will only result in a single kernel argument, which is due to it being a "SYCL special class" so the arguments of its
__initfunction is used as the arguments of the kernel and the function body is then used for constructing the object.All that said however, I also have some concerns here:
These members are public and by public inheritance they will also be public in@AlexeySachkov corrected me and he's right. Since the inheritor is awork_group_memory. They should not be, however.classthe default inheritance is private.device) where we have astd::shared_ptr<work_group_memory_impl>in this case, wherework_group_memory_implwould be moved to a source file. Then, we could have something like:Note that if we're sure we only ever need the two
size_t, I don't think there's a big problem in having them directly in the class. Reading through it though, I don't fully understand why we need twosize_t. If we could reduce it to one, we could do like Greg suggests further up.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But I was also confused at first, so I think it worth explicitly declaring members of the base class as
protected