Skip to content

Commit 73ca5fa

Browse files
committed
Update root group query
Signed-off-by: Michael Aziz <[email protected]>
1 parent 7116e9d commit 73ca5fa

File tree

4 files changed

+150
-48
lines changed

4 files changed

+150
-48
lines changed

sycl/include/sycl/kernel.hpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,32 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
170170
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
171171
ext_oneapi_get_info(queue Queue) const;
172172

173+
/// Query queue/launch-specific information from a kernel using the
174+
/// info::kernel_queue_specific descriptor for a specific Queue and values.
175+
/// max_num_work_groups is the only valid descriptor for this function.
176+
///
177+
/// \param Queue is a valid SYCL queue.
178+
/// \param WorkGroupSize is the work-group size the number of work-groups is
179+
/// requested for.
180+
/// \return depends on information being queried.
181+
template <typename Param>
182+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
183+
ext_oneapi_get_info(queue Queue, const range<1> &WorkGroupSize,
184+
size_t DynamicLocalMemorySize) const;
185+
186+
/// Query queue/launch-specific information from a kernel using the
187+
/// info::kernel_queue_specific descriptor for a specific Queue and values.
188+
/// max_num_work_groups is the only valid descriptor for this function.
189+
///
190+
/// \param Queue is a valid SYCL queue.
191+
/// \param WorkGroupSize is the work-group size the number of work-groups is
192+
/// requested for.
193+
/// \return depends on information being queried.
194+
template <typename Param>
195+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
196+
ext_oneapi_get_info(queue Queue, const range<2> &WorkGroupSize,
197+
size_t DynamicLocalMemorySize) const;
198+
173199
/// Query queue/launch-specific information from a kernel using the
174200
/// info::kernel_queue_specific descriptor for a specific Queue and values.
175201
/// max_num_work_groups is the only valid descriptor for this function.

sycl/source/detail/kernel_impl.cpp

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -106,38 +106,6 @@ void kernel_impl::checkIfValidForNumArgsInfoQuery() const {
106106
"interoperability function or to query a device built-in kernel");
107107
}
108108

109-
bool kernel_impl::exceedsOccupancyResourceLimits(
110-
const device &Device, const range<3> &WorkGroupSize,
111-
size_t DynamicLocalMemorySize) const {
112-
// Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
113-
// Generally, exceeding hardware resource limits will yield in an error when
114-
// the kernel is launched.
115-
const size_t MaxWorkGroupSize =
116-
get_info<info::kernel_device_specific::work_group_size>(Device);
117-
const size_t MaxLocalMemorySizeInBytes =
118-
Device.get_info<info::device::local_mem_size>();
119-
120-
if (WorkGroupSize.size() > MaxWorkGroupSize)
121-
return true;
122-
123-
if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
124-
return true;
125-
126-
// It will be impossible to launch a kernel for Cuda when the hardware limit
127-
// for the 32-bit registers page file size is exceeded.
128-
if (Device.get_backend() == backend::ext_oneapi_cuda) {
129-
const uint32_t RegsPerWorkItem =
130-
get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
131-
const uint32_t MaxRegsPerWorkGroup =
132-
Device.get_info<ext::codeplay::experimental::info::device::
133-
max_registers_per_work_group>();
134-
if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
135-
return true;
136-
}
137-
138-
return false;
139-
}
140-
141109
template <>
142110
typename info::platform::version::return_type
143111
kernel_impl::get_backend_info<info::platform::version>() const {

sycl/source/detail/kernel_impl.hpp

Lines changed: 102 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,32 @@ class kernel_impl {
122122
template <typename Param>
123123
typename Param::return_type ext_oneapi_get_info(queue Queue) const;
124124

125+
/// Query queue/launch-specific information from a kernel using the
126+
/// info::kernel_queue_specific descriptor for a specific Queue and values.
127+
/// max_num_work_groups is the only valid descriptor for this function.
128+
///
129+
/// \param Queue is a valid SYCL queue.
130+
/// \param WorkGroupSize is the work-group size the number of work-groups is
131+
/// requested for.
132+
/// \return depends on information being queried.
133+
template <typename Param>
134+
typename Param::return_type
135+
ext_oneapi_get_info(queue Queue, const range<1> &MaxWorkGroupSize,
136+
size_t DynamicLocalMemorySize) const;
137+
138+
/// Query queue/launch-specific information from a kernel using the
139+
/// info::kernel_queue_specific descriptor for a specific Queue and values.
140+
/// max_num_work_groups is the only valid descriptor for this function.
141+
///
142+
/// \param Queue is a valid SYCL queue.
143+
/// \param WorkGroupSize is the work-group size the number of work-groups is
144+
/// requested for.
145+
/// \return depends on information being queried.
146+
template <typename Param>
147+
typename Param::return_type
148+
ext_oneapi_get_info(queue Queue, const range<2> &MaxWorkGroupSize,
149+
size_t DynamicLocalMemorySize) const;
150+
125151
/// Query queue/launch-specific information from a kernel using the
126152
/// info::kernel_queue_specific descriptor for a specific Queue and values.
127153
/// max_num_work_groups is the only valid descriptor for this function.
@@ -192,11 +218,49 @@ class kernel_impl {
192218

193219
/// Check if the occupancy limits are exceeded for the given kernel launch
194220
/// configuration.
221+
template <int Dimensions>
195222
bool exceedsOccupancyResourceLimits(const device &Device,
196-
const range<3> &WorkGroupSize,
223+
const range<Dimensions> &WorkGroupSize,
197224
size_t DynamicLocalMemorySize) const;
225+
template <int Dimensions>
226+
size_t queryMaxNumWorkGroups(queue Queue,
227+
const range<Dimensions> &WorkGroupSize,
228+
size_t DynamicLocalMemorySize) const;
198229
};
199230

231+
template <int Dimensions>
232+
bool kernel_impl::exceedsOccupancyResourceLimits(
233+
const device &Device, const range<Dimensions> &WorkGroupSize,
234+
size_t DynamicLocalMemorySize) const {
235+
// Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
236+
// Generally, exceeding hardware resource limits will yield in an error when
237+
// the kernel is launched.
238+
const size_t MaxWorkGroupSize =
239+
get_info<info::kernel_device_specific::work_group_size>(Device);
240+
const size_t MaxLocalMemorySizeInBytes =
241+
Device.get_info<info::device::local_mem_size>();
242+
243+
if (WorkGroupSize.size() > MaxWorkGroupSize)
244+
return true;
245+
246+
if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
247+
return true;
248+
249+
// It will be impossible to launch a kernel for Cuda when the hardware limit
250+
// for the 32-bit registers page file size is exceeded.
251+
if (Device.get_backend() == backend::ext_oneapi_cuda) {
252+
const uint32_t RegsPerWorkItem =
253+
get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
254+
const uint32_t MaxRegsPerWorkGroup =
255+
Device.get_info<ext::codeplay::experimental::info::device::
256+
max_registers_per_work_group>();
257+
if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
258+
return true;
259+
}
260+
261+
return false;
262+
}
263+
200264
template <typename Param>
201265
inline typename Param::return_type kernel_impl::get_info() const {
202266
static_assert(is_kernel_info_desc<Param>::value,
@@ -243,13 +307,11 @@ kernel_impl::get_info(const device &Device,
243307

244308
namespace syclex = ext::oneapi::experimental;
245309

246-
template <>
247-
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
248-
return_type
249-
kernel_impl::ext_oneapi_get_info<
250-
syclex::info::kernel_queue_specific::max_num_work_groups>(
251-
queue Queue, const range<3> &WorkGroupSize,
252-
size_t DynamicLocalMemorySize) const {
310+
template <int Dimensions>
311+
size_t
312+
kernel_impl::queryMaxNumWorkGroups(queue Queue,
313+
const range<Dimensions> &WorkGroupSize,
314+
size_t DynamicLocalMemorySize) const {
253315
if (WorkGroupSize.size() == 0)
254316
throw exception(sycl::make_error_code(errc::invalid),
255317
"The launch work-group size cannot be zero.");
@@ -258,10 +320,17 @@ inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
258320
const auto &Handle = getHandleRef();
259321
auto Device = Queue.get_device();
260322

323+
size_t WG[Dimensions];
324+
WG[0] = WorkGroupSize[0];
325+
if constexpr (Dimensions >= 2)
326+
WG[1] = WorkGroupSize[1];
327+
if constexpr (Dimensions == 3)
328+
WG[2] = WorkGroupSize[2];
329+
261330
uint32_t GroupCount{0};
262331
if (auto Result = Adapter->call_nocheck<
263332
UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
264-
Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
333+
Handle, Dimensions, WG, DynamicLocalMemorySize, &GroupCount);
265334
Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
266335
// The feature is supported. Check for other errors and throw if any.
267336
Adapter->checkUrResult(Result);
@@ -277,15 +346,33 @@ inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
277346
}
278347

279348
template <>
280-
inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
349+
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
281350
return_type
282351
kernel_impl::ext_oneapi_get_info<
283-
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
352+
syclex::info::kernel_queue_specific::max_num_work_groups>(
353+
queue Queue, const range<1> &WorkGroupSize,
354+
size_t DynamicLocalMemorySize) const {
355+
return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
356+
}
357+
358+
template <>
359+
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
360+
return_type
361+
kernel_impl::ext_oneapi_get_info<
362+
syclex::info::kernel_queue_specific::max_num_work_groups>(
363+
queue Queue, const range<2> &WorkGroupSize,
364+
size_t DynamicLocalMemorySize) const {
365+
return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
366+
}
367+
368+
template <>
369+
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
370+
return_type
371+
kernel_impl::ext_oneapi_get_info<
372+
syclex::info::kernel_queue_specific::max_num_work_groups>(
284373
queue Queue, const range<3> &WorkGroupSize,
285374
size_t DynamicLocalMemorySize) const {
286-
return ext_oneapi_get_info<
287-
syclex::info::kernel_queue_specific::max_num_work_groups>(
288-
Queue, WorkGroupSize, DynamicLocalMemorySize);
375+
return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
289376
}
290377

291378
template <>
@@ -299,7 +386,7 @@ inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
299386
get_info<info::kernel_device_specific::work_group_size>(Device);
300387
const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
301388
return ext_oneapi_get_info<
302-
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
389+
syclex::info::kernel_queue_specific::max_num_work_groups>(
303390
Queue, WorkGroupSize, /* DynamicLocalMemorySize */ 0);
304391
}
305392

sycl/source/kernel.cpp

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,22 @@ kernel::ext_oneapi_get_info(queue Queue) const {
111111
return impl->ext_oneapi_get_info<Param>(Queue);
112112
}
113113

114+
template <typename Param>
115+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
116+
kernel::ext_oneapi_get_info(queue Queue, const range<1> &WorkGroupSize,
117+
size_t DynamicLocalMemorySize) const {
118+
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
119+
DynamicLocalMemorySize);
120+
}
121+
122+
template <typename Param>
123+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
124+
kernel::ext_oneapi_get_info(queue Queue, const range<2> &WorkGroupSize,
125+
size_t DynamicLocalMemorySize) const {
126+
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
127+
DynamicLocalMemorySize);
128+
}
129+
114130
template <typename Param>
115131
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
116132
kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
@@ -127,12 +143,17 @@ template __SYCL_EXPORT typename ext::oneapi::experimental::info::
127143

128144
#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT) \
129145
template __SYCL_EXPORT ReturnT \
146+
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
147+
queue, const range<1> &, size_t) const; \
148+
template __SYCL_EXPORT ReturnT \
149+
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
150+
queue, const range<2> &, size_t) const; \
151+
template __SYCL_EXPORT ReturnT \
130152
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
131153
queue, const range<3> &, size_t) const;
132154
// Not including "ext_oneapi_kernel_queue_specific_traits.def" because not all
133155
// kernel_queue_specific queries require the above-defined get_info interface.
134156
// clang-format off
135-
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t)
136157
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t)
137158
// clang-format on
138159
#undef __SYCL_PARAM_TRAITS_SPEC

0 commit comments

Comments
 (0)