Merged

Commits (37)
c9ad9ab
Add tests for work group memory extension
lbushi25 Oct 30, 2024
bd69b8b
Remove unused variable
lbushi25 Oct 30, 2024
83887be
Update reduction_free_function.cpp
lbushi25 Oct 30, 2024
17c4003
Merge branch 'work_group_memory_tests' of https://github.com/lbushi25…
lbushi25 Oct 31, 2024
3964f27
Fix missing aspect runtime errors in tests
lbushi25 Oct 31, 2024
69642a1
Revert "Fix missing aspect runtime errors in tests"
lbushi25 Oct 31, 2024
64096a7
Merge branch 'intel:sycl' into work_group_memory_tests
lbushi25 Oct 31, 2024
db2f720
Fix formatting
lbushi25 Oct 31, 2024
2307672
Fix formatting
lbushi25 Oct 31, 2024
553a127
Fix comment typo in free function kernel test
lbushi25 Oct 31, 2024
d8faebe
Remove ext/intel/math from includes
lbushi25 Oct 31, 2024
adb2331
Remove ext/intel/math from includes
lbushi25 Oct 31, 2024
671bea8
Add comment regarding the limitations of free function kernels
lbushi25 Oct 31, 2024
82cc19d
Fix formatting
lbushi25 Oct 31, 2024
7a21a40
Merge branch 'work_group_memory_tests' of https://github.com/lbushi25…
lbushi25 Nov 4, 2024
0f62c8b
Merge branch 'intel:sycl' into work_group_memory_tests
lbushi25 Nov 4, 2024
942e993
Merge branch 'work_group_memory_tests' of https://github.com/lbushi25…
lbushi25 Nov 4, 2024
f231a26
Add another constructor that takes an argument of indeterminate_t typ…
lbushi25 Nov 4, 2024
209a5b7
Add TODOs to remove diagnostic once feature is supported
lbushi25 Nov 4, 2024
8c203ef
Fix include fails
lbushi25 Nov 4, 2024
3ef139a
Update tests to conform to the spec
lbushi25 Nov 5, 2024
faa3382
Update tests to conform to the spec
lbushi25 Nov 5, 2024
5c0c4b3
Formatting changes
lbushi25 Nov 5, 2024
df3902f
Formatting changes
lbushi25 Nov 5, 2024
61944fc
Remove error limit from WorkGroupMemory test
lbushi25 Nov 6, 2024
bb63b54
Remove indeterminate change and add it in a separate PR
lbushi25 Nov 6, 2024
7b0c7bb
Merge branch 'work_group_memory_tests' of https://github.com/lbushi25…
lbushi25 Nov 6, 2024
3d85911
Apply suggested changes to api_misuse_test.cpp
lbushi25 Nov 6, 2024
3068b89
Add unsupported tracker for cuda failures
lbushi25 Nov 6, 2024
de090f7
Refactor tests by pulling out common functionality
lbushi25 Nov 7, 2024
07d1220
Print message about skipping tests when aspect not supported for a ce…
lbushi25 Nov 7, 2024
188f0fe
Fix merge conflicts
lbushi25 Nov 15, 2024
7595770
Add missing checks for aspect fp64
lbushi25 Nov 15, 2024
67aa143
Fix formatting
lbushi25 Nov 15, 2024
f1d99fa
Add check for empty properties
lbushi25 Nov 15, 2024
1091d6e
Fix error in test logic
lbushi25 Nov 15, 2024
7ecab94
Fix error in test logic
lbushi25 Nov 15, 2024
@@ -5,6 +5,8 @@
#include <sycl/detail/core.hpp>
#include <sycl/ext/oneapi/experimental/work_group_memory.hpp>
#include <sycl/group_barrier.hpp>
#include <sycl/half_type.hpp>

namespace syclexp = sycl::ext::oneapi::experimental;

sycl::queue q;
@@ -50,7 +52,9 @@ template <typename T> void swap_scalar(T &a, T &b) {
sycl::nd_range<1> ndr{size, wgsize};
cgh.parallel_for(ndr, [=](sycl::nd_item<1> it) {
syclexp::work_group_memory<T> temp2;
temp2 = temp; // temp and temp2 have the same underlying data
assert(&temp2 == &temp); // check that both objects return same
// underlying address after assignment
temp = acc_a[0];
acc_a[0] = acc_b[0];
acc_b[0] = temp2; // safe to use temp2
@@ -86,6 +90,8 @@ template <typename T> void swap_scalar(T &a, T &b) {
assert(a == old_b && b == old_a && "Incorrect swap!");

// Same as above but instead of using multi_ptr, use address-of operator.
// Also verify that get_multi_ptr() returns the same address as address-of
// operator.
{
sycl::buffer<T, 1> buf_a{&a, 1};
sycl::buffer<T, 1> buf_b{&b, 1};
@@ -96,6 +102,7 @@
syclexp::work_group_memory<T> temp2{cgh};
sycl::nd_range<1> ndr{size, wgsize};
cgh.parallel_for(ndr, [=](sycl::nd_item<> it) {
assert(&temp == temp.get_multi_ptr().get());
temp = acc_a[0];
acc_a[0] = acc_b[0];
temp2 = *(&temp);
@@ -294,6 +301,8 @@ void swap_array_2d(T (&a)[N][N], T (&b)[N][N], size_t batch_size) {
temp[i][j] = acc_a[i][j];
acc_a[i][j] = acc_b[i][j];
syclexp::work_group_memory<T[N][N]> temp2{temp};
assert(&temp2 == &temp); // check both objects return same underlying
// address after copy construction.
acc_b[i][j] = temp2[i][j];
});
});
@@ -342,28 +351,28 @@ void swap_array_2d(T (&a)[N][N], T (&b)[N][N], size_t batch_size) {
// so we can verify that each work-item sees the value written by its leader.
// The test also is a sanity check that different work groups get different
// work group memory locations as otherwise we'd have data races.
void coherency(size_t size, size_t wgsize) {
template <typename T> void coherency(size_t size, size_t wgsize) {
q.submit([&](sycl::handler &cgh) {
syclexp::work_group_memory<int> data{cgh};
syclexp::work_group_memory<T> data{cgh};
sycl::nd_range<1> ndr{size, wgsize};
cgh.parallel_for(ndr, [=](sycl::nd_item<1> it) {
if (it.get_group().leader()) {
data = it.get_global_id() / wgsize;
data = T(it.get_global_id() / wgsize);
}
sycl::group_barrier(it.get_group());
assert(data == it.get_global_id() / wgsize);
assert(data == T(it.get_global_id() / wgsize));
});
});
}

constexpr size_t N = 32;
int main() {
int intarr1[N][N];
int intarr2[N][N];
template <typename T> void test() {
T intarr1[N][N];
T intarr2[N][N];
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
intarr1[i][j] = i + j;
intarr2[i][j] = i * j;
intarr1[i][j] = T(i) + T(j);
intarr2[i][j] = T(i) * T(j);
}
}
for (int i = 0; i < N; ++i) {
@@ -373,10 +382,37 @@ int main() {
swap_array_1d(intarr1[i], intarr2[i], 8);
}
swap_array_2d(intarr1, intarr2, 8);
coherency(N, N / 2);
coherency(N, N / 4);
coherency(N, N / 8);
coherency(N, N / 16);
coherency(N, N / 32);
coherency<T>(N, N / 2);
coherency<T>(N, N / 4);
coherency<T>(N, N / 8);
coherency<T>(N, N / 16);
coherency<T>(N, N / 32);
}

template <typename T> void test_ptr() {
T arr1[N][N];
T arr2[N][N];
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
swap_scalar(arr1[i][j], arr2[i][j]);
}
swap_array_1d(arr1[i], arr2[i], 8);
}
swap_array_2d(arr1, arr2, 8);
}

int main() {
test<int>();
test<char>();
test<uint16_t>();
if (q.get_device().has(sycl::aspect::fp16))
test<sycl::half>();
test_ptr<float *>();
test_ptr<int *>();
test_ptr<char *>();
test_ptr<uint16_t *>();
if (q.get_device().has(sycl::aspect::fp16))
test_ptr<sycl::half *>();
test_ptr<float *>();
return 0;
}
266 changes: 266 additions & 0 deletions sycl/test-e2e/WorkGroupMemory/reduction_free_function.cpp
@@ -0,0 +1,266 @@
// REQUIRES: aspect-usm_shared_allocations
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// UNSUPPORTED: cuda
// UNSUPPORTED-INTENDED: The name mangling for free function kernels currently
// does not work with PTX.

// Usage of work group memory parameters in free function kernels is not yet
// implemented.
// TODO: Remove the following directive once
// https://github.com/intel/llvm/pull/15861 is merged.
// XFAIL: *
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/15927

#include <cassert>
#include <sycl/detail/core.hpp>
#include <sycl/ext/intel/math.hpp>
#include <sycl/ext/oneapi/experimental/work_group_memory.hpp>
#include <sycl/ext/oneapi/free_function_queries.hpp>
#include <sycl/group_barrier.hpp>
#include <sycl/usm.hpp>

using namespace sycl;

// Basic reduction test using free function kernels.
// A global buffer is allocated using USM and passed to the kernel on the
// device. On the device, a work group memory buffer is allocated, and each
// work-item copies the corresponding element of the global buffer into the
// work group memory buffer using its global index. The leader of every
// work-group, after waiting for every work-item to finish, sums these values
// and stores the result in another work group memory object. Finally, each
// work-item verifies that the sum of the work group memory elements equals
// the sum of the global buffer elements. This is repeated for several data
// types.

queue q;
context ctx = q.get_context();

constexpr size_t SIZE = 128;
constexpr size_t VEC_SIZE = 16;

template <typename T>
void sum_helper(sycl::ext::oneapi::experimental::work_group_memory<T[]> mem,
sycl::ext::oneapi::experimental::work_group_memory<T> ret,
size_t WGSIZE) {
for (int i = 0; i < WGSIZE; ++i) {
ret = ret + mem[i];
}
}

template <typename T>
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
(ext::oneapi::experimental::nd_range_kernel<1>))
void sum(sycl::ext::oneapi::experimental::work_group_memory<T[]> mem, T *buf,
sycl::ext::oneapi::experimental::work_group_memory<T> result,
T expected, size_t WGSIZE, bool UseHelper) {
const auto it = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
size_t local_id = it.get_local_id();
mem[local_id] = buf[local_id];
group_barrier(it.get_group());
if (it.get_group().leader()) {
result = 0;
if (!UseHelper) {
for (int i = 0; i < WGSIZE; ++i) {
result = result + mem[i];
}
} else {
sum_helper(mem, result, WGSIZE);
}
assert(result == expected);
}
}

// Explicit instantiations for the relevant data types.
Contributor:

Just curious, why are these explicit instantiations necessary?

lbushi25 (author), Oct 31, 2024:

For some reason, when these kernel names are passed to the free function kernel queries with concrete template arguments, the implementation complains that the names do not represent kernels because they have not been instantiated yet, although one would hope the instantiation would happen automatically when the names are passed to the queries.
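
A minimal sketch of the pattern in question, assuming the sum kernel template defined in this test (illustrative only; the explicit instantiation mirrors the SUM(T) macro below):

    // Hypothetical example: without an explicit instantiation, the kernel-id
    // query below reportedly rejects the name, because sum<int> is never
    // instantiated anywhere in the translation unit.
    template void sum<int>(
        sycl::ext::oneapi::experimental::work_group_memory<int[]> mem, int *buf,
        sycl::ext::oneapi::experimental::work_group_memory<int> result,
        int expected, size_t WGSIZE, bool UseHelper);

    // Succeeds once the instantiation above exists.
    sycl::kernel_id sum_id =
        sycl::ext::oneapi::experimental::get_kernel_id<sum<int>>();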

Contributor:

That sounds like either a bug or a missing feature with free function kernels. Can you add the same comment here about removing this code when free function kernels are fully supported?

Alternatively, maybe we should just delay testing of work_group_memory with free function kernels until there is better support for free function kernels. It seems like there are a lot of places where you need to add workarounds for missing features in free function kernels.

lbushi25 (author), replying to the above:

Sure. We can keep the test, since I've put an XFAIL directive in place to expect failure on all platforms while support is not yet implemented.
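
For reference, the directives in question, from the top of reduction_free_function.cpp in this PR:

    // XFAIL: *
    // XFAIL-TRACKER: https://github.com/intel/llvm/issues/15927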

#define SUM(T) \
template void sum<T>( \
sycl::ext::oneapi::experimental::work_group_memory<T[]> mem, T * buf, \
sycl::ext::oneapi::experimental::work_group_memory<T> result, \
T expected, size_t WGSIZE, bool UseHelper);

SUM(int)
SUM(uint16_t)
SUM(half)
SUM(double)
SUM(float)
SUM(char)
SUM(bool)

template <typename T>
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
(ext::oneapi::experimental::nd_range_kernel<1>))
void sum_marray(
sycl::ext::oneapi::experimental::work_group_memory<sycl::marray<T, 16>> mem,
T *buf, sycl::ext::oneapi::experimental::work_group_memory<T> result,
T expected) {
const auto it = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
size_t local_id = it.get_local_id();
constexpr T tolerance = 0.0001;
sycl::marray<T, 16> &data = mem;
data[local_id] = buf[local_id];
group_barrier(it.get_group());
if (it.get_group().leader()) {
result = 0;
for (int i = 0; i < 16; ++i) {
result = result + data[i];
}
assert((result - expected) * (result - expected) <= tolerance);
}
}

// Explicit instantiations for the relevant data types.
#define SUM_MARRAY(T) \
template void sum_marray<T>( \
sycl::ext::oneapi::experimental::work_group_memory<sycl::marray<T, 16>> \
mem, \
T * buf, sycl::ext::oneapi::experimental::work_group_memory<T> result, \
T expected);

SUM_MARRAY(float);
SUM_MARRAY(double);
SUM_MARRAY(half);

template <typename T>
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
(ext::oneapi::experimental::nd_range_kernel<1>))
void sum_vec(
sycl::ext::oneapi::experimental::work_group_memory<sycl::vec<T, 16>> mem,
T *buf, sycl::ext::oneapi::experimental::work_group_memory<T> result,
T expected) {
const auto it = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
size_t local_id = it.get_local_id();
constexpr T tolerance = 0.0001;
sycl::vec<T, 16> &data = mem;
data[local_id] = buf[local_id];
group_barrier(it.get_group());
if (it.get_group().leader()) {
result = 0;
for (int i = 0; i < 16; ++i) {
result = result + data[i];
}
assert((result - expected) * (result - expected) <= tolerance);
}
}

// Explicit instantiations for the relevant data types.
#define SUM_VEC(T) \
template void sum_vec<T>( \
sycl::ext::oneapi::experimental::work_group_memory<sycl::vec<T, 16>> \
mem, \
T * buf, sycl::ext::oneapi::experimental::work_group_memory<T> result, \
T expected);

SUM_VEC(float);
SUM_VEC(double);
SUM_VEC(half);

template <typename T, typename... Ts> void test_marray() {
if (std::is_same_v<sycl::half, T> && !q.get_device().has(sycl::aspect::fp16))
return;
if (std::is_same_v<T, double> && !q.get_device().has(aspect::fp64))
return;

constexpr size_t WGSIZE = VEC_SIZE;
T *buf = malloc_shared<T>(WGSIZE, q);
assert(buf && "Shared USM allocation failed!");
T expected = 0;
for (int i = 0; i < WGSIZE; ++i) {
buf[i] = T(i) / WGSIZE;
expected = expected + buf[i];
}
nd_range ndr{{SIZE}, {WGSIZE}};
#ifndef __SYCL_DEVICE_ONLY__
// Get the kernel object for the sum_marray<T> kernel.
auto Bundle = get_kernel_bundle<sycl::bundle_state::executable>(ctx);
kernel_id sum_id = ext::oneapi::experimental::get_kernel_id<sum_marray<T>>();
kernel k_sum = Bundle.get_kernel(sum_id);
q.submit([&](sycl::handler &cgh) {
ext::oneapi::experimental::work_group_memory<marray<T, WGSIZE>> mem{cgh};
ext::oneapi::experimental::work_group_memory<T> result{cgh};
cgh.set_args(mem, buf, result, expected);
cgh.parallel_for(ndr, k_sum);
}).wait();
#endif // __SYCL_DEVICE_ONLY__
free(buf, q);
if constexpr (sizeof...(Ts))
test_marray<Ts...>();
}

template <typename T, typename... Ts> void test_vec() {
if (std::is_same_v<sycl::half, T> && !q.get_device().has(sycl::aspect::fp16))
return;
if (std::is_same_v<T, double> && !q.get_device().has(aspect::fp64))
return;

constexpr size_t WGSIZE = VEC_SIZE;
T *buf = malloc_shared<T>(WGSIZE, q);
assert(buf && "Shared USM allocation failed!");
T expected = 0;
for (int i = 0; i < WGSIZE; ++i) {
buf[i] = T(i) / WGSIZE;
expected = expected + buf[i];
}
nd_range ndr{{SIZE}, {WGSIZE}};
#ifndef __SYCL_DEVICE_ONLY__
// Get the kernel object for the sum_vec<T> kernel.
auto Bundle = get_kernel_bundle<sycl::bundle_state::executable>(ctx);
kernel_id sum_id = ext::oneapi::experimental::get_kernel_id<sum_vec<T>>();
kernel k_sum = Bundle.get_kernel(sum_id);
q.submit([&](sycl::handler &cgh) {
ext::oneapi::experimental::work_group_memory<vec<T, WGSIZE>> mem{cgh};
ext::oneapi::experimental::work_group_memory<T> result{cgh};
cgh.set_args(mem, buf, result, expected);
cgh.parallel_for(ndr, k_sum);
}).wait();
#endif // __SYCL_DEVICE_ONLY__
free(buf, q);
if constexpr (sizeof...(Ts))
test_vec<Ts...>();
}

template <typename T, typename... Ts>
void test(size_t SIZE, size_t WGSIZE, bool UseHelper) {
if (std::is_same_v<sycl::half, T> && !q.get_device().has(sycl::aspect::fp16))
return;
if (std::is_same_v<T, double> && !q.get_device().has(aspect::fp64))
return;

T *buf = malloc_shared<T>(WGSIZE, q);
assert(buf && "Shared USM allocation failed!");
T expected = 0;
for (int i = 0; i < WGSIZE; ++i) {
buf[i] = T(i);
expected = expected + buf[i];
}
nd_range ndr{{SIZE}, {WGSIZE}};
// The following ifndef is required due to a number of limitations of free
// function kernels. See CMPLRLLVM-61498.
// TODO: Remove it once these limitations are no longer there.
#ifndef __SYCL_DEVICE_ONLY__
// Get the kernel object for the sum<T> kernel.
auto Bundle = get_kernel_bundle<sycl::bundle_state::executable>(ctx);
kernel_id sum_id = ext::oneapi::experimental::get_kernel_id<sum<T>>();
kernel k_sum = Bundle.get_kernel(sum_id);
q.submit([&](sycl::handler &cgh) {
ext::oneapi::experimental::work_group_memory<T[]> mem{WGSIZE, cgh};
ext::oneapi::experimental::work_group_memory<T> result{cgh};
cgh.set_args(mem, buf, result, expected, WGSIZE, UseHelper);
cgh.parallel_for(ndr, k_sum);
}).wait();

#endif // __SYCL_DEVICE_ONLY__
free(buf, q);
if constexpr (sizeof...(Ts))
test<Ts...>(SIZE, WGSIZE, UseHelper);
}

int main() {
test<int, uint16_t, half, double, float>(SIZE, SIZE, true /* UseHelper */);
test<int, float, half>(SIZE, SIZE, false);
test<int, double, char>(SIZE, SIZE / 2, false);
test<int, bool, char>(SIZE, SIZE / 4, false);
test_marray<float, double, half>();
test_vec<float, double, half>();
return 0;
}