22// XFAIL: (opencl && !cpu && !accelerator)
33// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14641
44
5- // RUN: %{build} -I . -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
5+ // TODO: Currently using the -Wno-deprecated-declarations flag due to issue
6+ // https://github.com/intel/llvm/issues/16451. Rewrite testRootGroup() and
7+ // remove the flag once the issue is resolved.
8+ // RUN: %{build} -I . -o %t.out -Wno-deprecated-declarations %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
69// RUN: %{run} %t.out
710
811// Disabled temporarily while investigation into the failure is ongoing.
@@ -60,34 +63,6 @@ void testQueriesAndProperties() {
6063 check_max_num_work_group_sync (maxWGsWithLimits);
6164}
6265
63- template <typename T> struct TestKernel1 {
64- T m_data;
65- TestKernel1 (T &data_) : m_data(data_) {}
66- void operator ()(sycl::nd_item<1 > it) const {
67- volatile float X = 1 .0f ;
68- volatile float Y = 1 .0f ;
69- auto root = it.ext_oneapi_get_root_group ();
70- m_data[root.get_local_id ()] = root.get_local_id ();
71- sycl::group_barrier (root);
72- // Delay half of the workgroups with extra work to check that the barrier
73- // synchronizes the whole device.
74- if (it.get_group (0 ) % 2 == 0 ) {
75- X += sycl::sin (X);
76- Y += sycl::cos (Y);
77- }
78- root = sycl::ext::oneapi::experimental::this_work_item::get_root_group<1 >();
79- int sum = m_data[root.get_local_id ()] +
80- m_data[root.get_local_range () - root.get_local_id () - 1 ];
81- sycl::group_barrier (root);
82- m_data[root.get_local_id ()] = sum;
83- }
84- auto get (sycl::ext::oneapi::experimental::properties_tag) {
85- return sycl::ext::oneapi::experimental::properties{
86- sycl::ext::oneapi::experimental::use_root_sync};
87- ;
88- }
89- };
90-
9166void testRootGroup () {
9267 sycl::queue q;
9368 const auto bundle =
@@ -98,11 +73,32 @@ void testRootGroup() {
9873 .ext_oneapi_get_info <sycl::ext::oneapi::experimental::info::
9974 kernel_queue_specific::max_num_work_groups>(
10075 q, WorkGroupSize, 0 );
76+ const auto props = sycl::ext::oneapi::experimental::properties{
77+ sycl::ext::oneapi::experimental::use_root_sync};
10178 sycl::buffer<int > dataBuf{sycl::range{maxWGs * WorkGroupSize}};
10279 const auto range = sycl::nd_range<1 >{maxWGs * WorkGroupSize, WorkGroupSize};
10380 q.submit ([&](sycl::handler &h) {
10481 sycl::accessor data{dataBuf, h};
105- h.parallel_for <class RootGroupKernel >(range, TestKernel1 (data));
82+ h.parallel_for <
83+ class RootGroupKernel >(range, props, [=](sycl::nd_item<1 > it) {
84+ volatile float X = 1 .0f ;
85+ volatile float Y = 1 .0f ;
86+ auto root = it.ext_oneapi_get_root_group ();
87+ data[root.get_local_id ()] = root.get_local_id ();
88+ sycl::group_barrier (root);
89+ // Delay half of the workgroups with extra work to check that the barrier
90+ // synchronizes the whole device.
91+ if (it.get_group (0 ) % 2 == 0 ) {
92+ X += sycl::sin (X);
93+ Y += sycl::cos (Y);
94+ }
95+ root =
96+ sycl::ext::oneapi::experimental::this_work_item::get_root_group<1 >();
97+ int sum = data[root.get_local_id ()] +
98+ data[root.get_local_range () - root.get_local_id () - 1 ];
99+ sycl::group_barrier (root);
100+ data[root.get_local_id ()] = sum;
101+ });
106102 });
107103 sycl::host_accessor data{dataBuf};
108104 const int workItemCount = static_cast <int >(range.get_global_range ().size ());
0 commit comments