21 changes: 21 additions & 0 deletions sycl/doc/syclcompat/README.md
@@ -1573,6 +1573,27 @@ public:
} // namespace syclcompat
```

SYCLcompat provides a wrapper API, `max_active_work_groups_per_cu`, with
'work-groups per compute unit' semantics. It is templated on the kernel
functor and takes a work-group size, expressed as either a `sycl::range<Dim>`
or a `syclcompat::dim3`, the local memory size in bytes, and an optional
queue. The function returns the maximum number of work-groups which can be
executed per compute unit. Note that it may return *zero* even when the
kernel is below resource limits (i.e. returning `0` does not imply the kernel
cannot execute).
```cpp
namespace syclcompat {
template <class KernelName>
size_t max_active_work_groups_per_cu(
syclcompat::dim3 wg_dim3, size_t local_mem_size,
sycl::queue queue = syclcompat::get_default_queue());

template <class KernelName, int RangeDim>
size_t max_active_work_groups_per_cu(
sycl::range<RangeDim> wg_range, size_t local_mem_size,
sycl::queue queue = syclcompat::get_default_queue());
}
```
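
For illustration, here is a minimal usage sketch. The kernel functor
`MyKernel` and the work-group size are hypothetical, and the kernel must be
instantiated somewhere in the program (e.g. by a `parallel_for`) so that it
is present in the executable kernel bundle:

```cpp
#include <sycl/detail/core.hpp>
#include <syclcompat/util.hpp>

// Hypothetical kernel functor, assumed to be launched elsewhere so that it
// exists in the executable kernel bundle.
struct MyKernel {
  void operator()(sycl::nd_item<1>) const {}
};

size_t query_occupancy(sycl::queue q) {
  // How many 256-work-item work-groups of MyKernel can be active per
  // compute unit, with no dynamic local memory? A result of 0 does not
  // necessarily mean the kernel cannot launch.
  return syclcompat::max_active_work_groups_per_cu<MyKernel>(
      sycl::range<1>{256}, /*local_mem_size=*/0, q);
}
```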

To assist machine translation, helper aliases are provided for inlining and
alignment attributes. The class template declarations `sycl_compat_kernel_name`
and `sycl_compat_kernel_scalar` are used to assist automatic generation of
41 changes: 41 additions & 0 deletions sycl/include/syclcompat/util.hpp
@@ -39,6 +39,7 @@

#include <syclcompat/math.hpp>
#include <syclcompat/memory.hpp>
#include <syclcompat/dims.hpp>

#if defined(__NVPTX__)
#include <sycl/ext/oneapi/experimental/cuda/masked_shuffles.hpp>
@@ -919,6 +920,46 @@ class group : public group_base<dimensions> {
};
} // namespace experimental

// Calculate the maximum number of work-groups per compute unit
// \tparam [in] KernelName SYCL kernel name to calculate for
// \param [in] wg_dim3 dim3 representing the work-group shape
// \param [in] local_mem_size Local memory usage per work-group in bytes
// \param [in] queue SYCL queue used to execute the kernel
// \return size_t representing the maximum number of work-groups per compute unit
template <class KernelName>
size_t max_active_work_groups_per_cu(
syclcompat::dim3 wg_dim3, size_t local_mem_size,
sycl::queue queue = syclcompat::get_default_queue()) {
namespace syclex = sycl::ext::oneapi::experimental;
  auto ctx = queue.get_context();
  auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx);
  auto kernel = bundle.template get_kernel<KernelName>();
  // max_num_work_groups only supports sycl::range<3>, so convert the dim3
  sycl::range<3> wg_range_3d(wg_dim3);
size_t max_wgs = kernel.template ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_groups>(queue, wg_range_3d,
local_mem_size);
size_t max_compute_units =
queue.get_device().get_info<sycl::info::device::max_compute_units>();
// Spec dictates max_compute_units > 0, so no need to catch div 0
return max_wgs / max_compute_units;
}

// Calculate the maximum number of work-groups per compute unit
// \tparam [in] KernelName SYCL kernel name to calculate for
// \tparam [in] RangeDim the dimension of the sycl::range
// \param [in] wg_range SYCL work-group range
// \param [in] local_mem_size Local memory usage per work-group in bytes
// \param [in] queue SYCL queue used to execute the kernel
// \return size_t representing the maximum number of work-groups per compute unit
template <class KernelName, int RangeDim>
size_t max_active_work_groups_per_cu(
sycl::range<RangeDim> wg_range, size_t local_mem_size,
sycl::queue queue = syclcompat::get_default_queue()) {
return max_active_work_groups_per_cu<KernelName>(syclcompat::dim3(wg_range),
local_mem_size, queue);
}

/// If x <= 2, then return a pointer to the default queue;
/// otherwise, return x reinterpreted as a queue_ptr.
inline queue_ptr int_as_queue_ptr(uintptr_t x) {
133 changes: 133 additions & 0 deletions sycl/test-e2e/syclcompat/util/max_active_work_groups_per_cu.cpp
@@ -0,0 +1,133 @@
/***************************************************************************
*
* Copyright (C) Codeplay Software Ltd.
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM
* Exceptions. See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SYCLcompat
*
* max_active_work_groups_per_cu.cpp
*
* Description:
* Test the syclcompat::max_active_work_groups_per_cu API
**************************************************************************/
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "sycl/accessor.hpp"
#include <sycl/detail/core.hpp>
#include <syclcompat/util.hpp>

template <class T, size_t Dim>
using sycl_global_accessor =
sycl::accessor<T, Dim, sycl::access::mode::read_write,
sycl::access::target::global_buffer>;

using value_type = int;

template <int RangeDim> struct MyKernel {
MyKernel(sycl_global_accessor<value_type, RangeDim> acc) : acc_{acc} {}
void operator()(sycl::nd_item<RangeDim> item) const {
auto gid = item.get_global_id();
acc_[gid] = item.get_global_linear_id();
}
sycl_global_accessor<value_type, RangeDim> acc_;
static constexpr bool has_local_mem = false;
};

template <int RangeDim> struct MyLocalMemKernel {
MyLocalMemKernel(sycl_global_accessor<value_type, RangeDim> acc,
sycl::local_accessor<value_type, RangeDim> lacc)
: acc_{acc}, lacc_{lacc} {}
void operator()(sycl::nd_item<RangeDim> item) const {
auto gid = item.get_global_id();
acc_[gid] = item.get_global_linear_id();
auto lid = item.get_local_id();
lacc_[lid] = item.get_global_linear_id();
}
sycl_global_accessor<value_type, RangeDim> acc_;
sycl::local_accessor<value_type, RangeDim> lacc_;
static constexpr bool has_local_mem = true;
};

template <template <int> class KernelName, int RangeDim>
void test_max_active_work_groups_per_cu(sycl::queue q,
sycl::range<RangeDim> wg_range,
size_t local_mem_size = 0) {
if constexpr (!KernelName<RangeDim>::has_local_mem)
assert(local_mem_size == 0 && "Bad test setup");

size_t max_per_cu = syclcompat::max_active_work_groups_per_cu<KernelName<RangeDim>>(
wg_range, local_mem_size, q);

// Check we get the same result passing equivalent dim3
syclcompat::dim3 wg_dim3{wg_range};
size_t max_per_cu_dim3 = syclcompat::max_active_work_groups_per_cu<KernelName<RangeDim>>(
wg_dim3, local_mem_size, q);
assert(max_per_cu == max_per_cu_dim3);

// Compare w/ reference impl
size_t max_compute_units =
q.get_device().get_info<sycl::info::device::max_compute_units>();
namespace syclex = sycl::ext::oneapi::experimental;
auto ctx = q.get_context();
auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx);
auto kernel = bundle.template get_kernel<KernelName<RangeDim>>();
size_t max_wgs = kernel.template ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_groups>(
q, sycl::range<3>{syclcompat::dim3{wg_range}}, local_mem_size);
assert(max_per_cu == max_wgs / max_compute_units);
Review thread:

AD2605 (Contributor, Oct 22, 2024): Do we expect NDEBUG to not be defined
during testing? Does the CI build and test DPCPP with a specific
configuration, or is the build type string left empty?

Author (Contributor): Yes, asserts work as expected during testing. llvm-lit
will test on as many backends as are available, AFAIK.

AD2605: thanks

  // We aren't interested in the launch itself; it exists only so the kernel
  // is defined and present in the executable bundle
  if (false) {
    sycl::range<RangeDim> global_range = wg_range;
    if (max_per_cu > 0)
      global_range[0] = global_range[0] * max_per_cu * max_compute_units;
sycl::nd_range<RangeDim> my_range{global_range, wg_range};
sycl::buffer<value_type, RangeDim> buf{global_range};

q.submit([&](sycl::handler &cgh) {
auto acc = buf.template get_access<sycl::access::mode::read_write>(cgh);
if constexpr (KernelName<RangeDim>::has_local_mem) {
sycl::local_accessor<value_type, RangeDim> lacc(
my_range.get_local_range(), cgh);
cgh.parallel_for(my_range, KernelName<RangeDim>{acc, lacc});
} else {
cgh.parallel_for(my_range, KernelName<RangeDim>{acc});
}
});
}
}

int main() {
sycl::queue q{};
sycl::range<1> range_1d{32};
sycl::range<2> range_2d{1, 32};
sycl::range<3> range_3d{1, 1, 32};
syclcompat::dim3 wg_dim3{32, 1, 1};

size_t lmem_size_small = sizeof(value_type) * 32;
size_t lmem_size_medium = lmem_size_small * 32;
size_t lmem_size_large = lmem_size_medium * 32;

test_max_active_work_groups_per_cu<MyKernel, 3>(q, range_3d);
test_max_active_work_groups_per_cu<MyKernel, 2>(q, range_2d);
test_max_active_work_groups_per_cu<MyKernel, 1>(q, range_1d);
test_max_active_work_groups_per_cu<MyLocalMemKernel, 3>(q, range_3d,
lmem_size_small);
test_max_active_work_groups_per_cu<MyLocalMemKernel, 3>(q, range_3d,
lmem_size_medium);
test_max_active_work_groups_per_cu<MyLocalMemKernel, 3>(q, range_3d,
lmem_size_large);
test_max_active_work_groups_per_cu<MyLocalMemKernel, 1>(q, range_1d,
lmem_size_large);
return 0;
}